junhe_arm/glibc

forked from src-openEuler/glibc 
0011-Sw64-Integer-Operation-Support.patch 211.32 KB
swcompiler committed on 2024-12-17 04:02 +08:00: Sw64: Add Sw64 ISA support
From 8045463341b2495da7b2e7dc308a023764315bbe Mon Sep 17 00:00:00 2001
From: swcompiler <lc@wxiat.com>
Date: Fri, 29 Nov 2024 14:15:45 +0800
Subject: [PATCH 11/23] Sw64: Integer Operation Support
---
sysdeps/sw_64/add_n.S | 118 +++++++++
sysdeps/sw_64/addmul_1.S | 89 +++++++
sysdeps/sw_64/bzero.S | 107 ++++++++
sysdeps/sw_64/div.S | 83 ++++++
sysdeps/sw_64/div_libc.h | 170 ++++++++++++
sysdeps/sw_64/divl.S | 96 +++++++
sysdeps/sw_64/divlu.S | 4 +
sysdeps/sw_64/divq.S | 290 +++++++++++++++++++++
sysdeps/sw_64/divqu.S | 292 +++++++++++++++++++++
sysdeps/sw_64/htonl.S | 43 +++
sysdeps/sw_64/htons.S | 39 +++
sysdeps/sw_64/ldiv.S | 222 ++++++++++++++++
sysdeps/sw_64/lldiv.S | 1 +
sysdeps/sw_64/lshift.S | 107 ++++++++
sysdeps/sw_64/mul_1.S | 82 ++++++
sysdeps/sw_64/reml.S | 93 +++++++
sysdeps/sw_64/remlu.S | 4 +
sysdeps/sw_64/remq.S | 274 ++++++++++++++++++++
sysdeps/sw_64/remqu.S | 292 +++++++++++++++++++++
sysdeps/sw_64/rshift.S | 105 ++++++++
sysdeps/sw_64/sub_n.S | 118 +++++++++
sysdeps/sw_64/submul_1.S | 89 +++++++
sysdeps/sw_64/sw6a/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw6a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw6a/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw6a/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw6a/sub_n.S | 147 +++++++++++
sysdeps/sw_64/sw6b/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw6b/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw6b/memcpy.S | 416 +++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/memset.S | 312 ++++++++++++++++++++++
sysdeps/sw_64/sw6b/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw6b/stxcpy.S | 314 ++++++++++++++++++++++
sysdeps/sw_64/sw6b/stxncpy.S | 392 ++++++++++++++++++++++++++++
sysdeps/sw_64/sw6b/sub_n.S | 147 +++++++++++
sysdeps/sw_64/sw8a/add_n.S | 146 +++++++++++
sysdeps/sw_64/sw8a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
sysdeps/sw_64/sw8a/lshift.S | 172 ++++++++++++
sysdeps/sw_64/sw8a/rshift.S | 170 ++++++++++++
sysdeps/sw_64/sw8a/sub_n.S | 147 +++++++++++
sysdeps/sw_64/udiv_qrnnd.S | 159 ++++++++++++
42 files changed, 7641 insertions(+)
create mode 100644 sysdeps/sw_64/add_n.S
create mode 100644 sysdeps/sw_64/addmul_1.S
create mode 100644 sysdeps/sw_64/bzero.S
create mode 100644 sysdeps/sw_64/div.S
create mode 100644 sysdeps/sw_64/div_libc.h
create mode 100644 sysdeps/sw_64/divl.S
create mode 100644 sysdeps/sw_64/divlu.S
create mode 100644 sysdeps/sw_64/divq.S
create mode 100644 sysdeps/sw_64/divqu.S
create mode 100644 sysdeps/sw_64/htonl.S
create mode 100644 sysdeps/sw_64/htons.S
create mode 100644 sysdeps/sw_64/ldiv.S
create mode 100644 sysdeps/sw_64/lldiv.S
create mode 100644 sysdeps/sw_64/lshift.S
create mode 100644 sysdeps/sw_64/mul_1.S
create mode 100644 sysdeps/sw_64/reml.S
create mode 100644 sysdeps/sw_64/remlu.S
create mode 100644 sysdeps/sw_64/remq.S
create mode 100644 sysdeps/sw_64/remqu.S
create mode 100644 sysdeps/sw_64/rshift.S
create mode 100644 sysdeps/sw_64/sub_n.S
create mode 100644 sysdeps/sw_64/submul_1.S
create mode 100644 sysdeps/sw_64/sw6a/add_n.S
create mode 100644 sysdeps/sw_64/sw6a/addmul_1.S
create mode 100644 sysdeps/sw_64/sw6a/lshift.S
create mode 100644 sysdeps/sw_64/sw6a/rshift.S
create mode 100644 sysdeps/sw_64/sw6a/sub_n.S
create mode 100644 sysdeps/sw_64/sw6b/add_n.S
create mode 100644 sysdeps/sw_64/sw6b/addmul_1.S
create mode 100644 sysdeps/sw_64/sw6b/lshift.S
create mode 100644 sysdeps/sw_64/sw6b/memcpy.S
create mode 100644 sysdeps/sw_64/sw6b/memset.S
create mode 100644 sysdeps/sw_64/sw6b/rshift.S
create mode 100644 sysdeps/sw_64/sw6b/stxcpy.S
create mode 100644 sysdeps/sw_64/sw6b/stxncpy.S
create mode 100644 sysdeps/sw_64/sw6b/sub_n.S
create mode 100644 sysdeps/sw_64/sw8a/add_n.S
create mode 100644 sysdeps/sw_64/sw8a/addmul_1.S
create mode 100644 sysdeps/sw_64/sw8a/lshift.S
create mode 100644 sysdeps/sw_64/sw8a/rshift.S
create mode 100644 sysdeps/sw_64/sw8a/sub_n.S
create mode 100644 sysdeps/sw_64/udiv_qrnnd.S
diff --git a/sysdeps/sw_64/add_n.S b/sysdeps/sw_64/add_n.S
new file mode 100644
index 00000000..8c5c8c08
--- /dev/null
+++ b/sysdeps/sw_64/add_n.S
@@ -0,0 +1,118 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ ldl $3,0($17)
+ ldl $4,0($18)
+
+ subl $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+ subl $19,$2,$19
+
+.Loop0: subl $2,1,$2
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ addl $17,8,$17
+ addl $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addl $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subl $19,4,$19
+
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ ldl $3,16($17)
+ addl $6,$0,$6
+ ldl $4,16($18)
+ cmpult $6,$0,$1
+ addl $5,$6,$6
+ cmpult $6,$5,$0
+ stl $6,8($16)
+ or $0,$1,$0
+
+ ldl $5,24($17)
+ addl $4,$0,$4
+ ldl $6,24($18)
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,16($16)
+ or $0,$1,$0
+
+ ldl $3,32($17)
+ addl $6,$0,$6
+ ldl $4,32($18)
+ cmpult $6,$0,$1
+ addl $5,$6,$6
+ cmpult $6,$5,$0
+ stl $6,24($16)
+ or $0,$1,$0
+
+ addl $17,32,$17
+ addl $18,32,$18
+ addl $16,32,$16
+ bne $19,.Loop
+
+.Lend: addl $4,$0,$4
+ cmpult $4,$0,$1
+ addl $3,$4,$4
+ cmpult $4,$3,$0
+ stl $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_add_n
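The routine above is the generic carry-propagating limb add. For reference, here is a minimal C sketch of the computation it performs (not part of the patch; mp_limb_t is assumed to be a 64-bit limb and the argument names follow the INPUT PARAMETERS comment):

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed */

/* Reference model of __mpn_add_n: add two equal-length limb vectors,
   store the sum in res_ptr, and return the carry out of the top limb.  */
mp_limb_t
ref_mpn_add_n (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
               const mp_limb_t *s2_ptr, long size)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < size; i++)
    {
      mp_limb_t a = s1_ptr[i], b = s2_ptr[i];
      mp_limb_t sum = b + cy;
      mp_limb_t c1 = sum < cy;            /* carry from b + cy (cmpult) */
      sum += a;
      mp_limb_t c2 = sum < a;             /* carry from a + sum (cmpult) */
      res_ptr[i] = sum;
      cy = c1 | c2;                       /* at most one can be set */
    }
  return cy;
}

The assembly unrolls this loop four ways and software-pipelines the loads, but the per-limb addl/cmpult/or pattern is exactly this carry computation.
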
diff --git a/sysdeps/sw_64/addmul_1.S b/sysdeps/sw_64/addmul_1.S
new file mode 100644
index 00000000..138e3c69
--- /dev/null
+++ b/sysdeps/sw_64/addmul_1.S
@@ -0,0 +1,89 @@
+ # Sw_64 1621 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1 2
+__mpn_addmul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ addl $5,$3,$3
+ cmpult $3,$5,$4
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ addl $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $5,$0,$0 # combine carries
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: addl $5,$3,$3
+ cmpult $3,$5,$5
+ stl $3,0($16)
+ addl $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_addmul_1
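As with add_n, a hedged C model of what __mpn_addmul_1 computes may help when reading the pipelined assembly (not part of the patch; the compiler's 128-bit type stands in for the mull/umulh instruction pair):

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed */

/* Reference model of __mpn_addmul_1: res_ptr[] += s1_ptr[] * s2_limb,
   returning the final carry limb.  */
mp_limb_t
ref_mpn_addmul_1 (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                  long size, mp_limb_t s2_limb)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < size; i++)
    {
      unsigned __int128 prod =
        (unsigned __int128) s1_ptr[i] * s2_limb;  /* mull + umulh */
      mp_limb_t lo = (mp_limb_t) prod + cy;
      mp_limb_t hi = (mp_limb_t) (prod >> 64);
      hi += lo < (mp_limb_t) prod;                /* carry from adding cy */
      mp_limb_t sum = res_ptr[i] + lo;
      hi += sum < lo;                             /* carry from adding *res_ptr */
      res_ptr[i] = sum;
      cy = hi;
    }
  return cy;
}
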
diff --git a/sysdeps/sw_64/bzero.S b/sysdeps/sw_64/bzero.S
new file mode 100644
index 00000000..1a020afd
--- /dev/null
+++ b/sysdeps/sw_64/bzero.S
@@ -0,0 +1,107 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Fill a block of memory with zeros. Optimized for the Sw_64 architecture:
+
+ - memory accessed as aligned quadwords only
+ - destination memory not read unless needed for good cache behaviour
+ - basic blocks arranged to optimize branch prediction for full-quadword
+ aligned memory blocks.
+ - partial head and tail quadwords constructed with byte-mask instructions
+
+*/
+
+
+#include <sysdep.h>
+
+ .set noat
+ .set noreorder
+
+ .text
+ .type __bzero, @function
+ .globl __bzero
+ .usepv __bzero, USEPV_PROF
+
+ cfi_startproc
+
+ /* On entry to this basic block:
+ t3 == loop counter
+ t4 == bytes in partial final word
+ a0 == possibly misaligned destination pointer */
+
+ .align 3
+bzero_loop:
+ beq t3, $tail #
+ blbc t3, 0f # skip single store if count even
+
+ stl_u zero, 0(a0) # e0 : store one word
+ subl t3, 1, t3 # .. e1 :
+ addl a0, 8, a0 # e0 :
+ beq t3, $tail # .. e1 :
+
+0: stl_u zero, 0(a0) # e0 : store two words
+ subl t3, 2, t3 # .. e1 :
+ stl_u zero, 8(a0) # e0 :
+ addl a0, 16, a0 # .. e1 :
+ bne t3, 0b # e1 :
+
+$tail: bne t4, 1f # is there a tail to do?
+ ret # no
+
+1: ldl_u t0, 0(a0) # yes, load original data
+ mask7b t0, t4, t0 #
+ stl_u t0, 0(a0) #
+ ret #
+
+__bzero:
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+#endif
+
+ mov a0, v0 # e0 : move return value in place
+ beq a1, $done # .. e1 : early exit for zero-length store
+ and a0, 7, t1 # e0 :
+ addl a1, t1, a1 # e1 : add dest misalignment to count
+ srl a1, 3, t3 # e0 : loop = count >> 3
+ and a1, 7, t4 # .. e1 : find number of bytes in tail
+ unop # :
+ beq t1, bzero_loop # e1 : aligned head, jump right in
+
+ ldl_u t0, 0(a0) # e0 : load original data to mask into
+ cmpult a1, 8, t2 # .. e1 : is this a sub-word set
+ bne t2, $oneq # e1 :
+
+ mask3b t0, a0, t0 # e0 : we span words. finish this partial
+ subl t3, 1, t3 # e0 :
+ addl a0, 8, a0 # .. e1 :
+ stl_u t0, -8(a0) # e0 :
+ br bzero_loop # .. e1 :
+
+ .align 3
+$oneq:
+ mask3b t0, a0, t2 # e0 :
+ mask7b t0, a1, t3 # e0 :
+ or t2, t3, t0 # e1 :
+ stl_u t0, 0(a0) # e0 :
+
+$done: ret
+
+ cfi_endproc
+weak_alias (__bzero, bzero)
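The header comment above describes the alignment strategy; the following is an illustrative C sketch of that strategy (hypothetical helper, not the actual implementation: the masked ldl_u/mask/stl_u read-modify-write on the partial head and tail quadwords is shown here as plain byte stores):

#include <stddef.h>
#include <stdint.h>

void
sketch_bzero (void *dst, size_t len)
{
  unsigned char *p = dst;
  if (len == 0)
    return;                              /* early exit, like beq a1, $done */

  size_t head = (uintptr_t) p & 7;       /* bytes of misalignment */
  if (head)
    {
      size_t n = 8 - head;
      if (n > len)
        n = len;                         /* sub-word case ($oneq) */
      for (size_t i = 0; i < n; i++)     /* stands in for ldl_u/mask3b/stl_u */
        p[i] = 0;
      p += n;
      len -= n;
    }

  for (; len >= 8; p += 8, len -= 8)     /* aligned quadword body (bzero_loop) */
    *(uint64_t *) p = 0;

  for (size_t i = 0; i < len; i++)       /* partial tail ($tail / mask7b) */
    p[i] = 0;
}
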
diff --git a/sysdeps/sw_64/div.S b/sysdeps/sw_64/div.S
new file mode 100644
index 00000000..6dbdcb7f
--- /dev/null
+++ b/sysdeps/sw_64/div.S
@@ -0,0 +1,83 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Richard Henderson <rth@tamu.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+#undef FRAME
+#ifdef __sw_64_fix__
+#define FRAME 0
+#else
+#define FRAME 16
+#endif
+
+ .set noat
+
+ .align 4
+ .globl div
+ .ent div
+div:
+ .frame sp, FRAME, ra
+#if FRAME > 0
+ ldi sp, -FRAME(sp)
+#endif
+#ifdef PROF
+ .set macro
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ beq $18, $divbyzero
+ rfpcr $f10
+ _ITOFT2 $17, $f0, 0, $18, $f1, 8
+ fcvtld $f0, $f11
+ fcvtld $f1, $f12
+ fdivd $f11, $f12, $f1
+ fcvtdl_z $f1, $f0
+ wfpcr $f10
+ _FTOIT $f0, $0, 0
+
+ mulw $0, $18, $1
+ subw $17, $1, $1
+
+ stw $0, 0(a0)
+ stw $1, 4(a0)
+ mov a0, v0
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+$divbyzero:
+ mov a0, v0
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+ stw zero, 0(v0)
+ stw zero, 4(v0)
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .end div
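Functionally, div.S implements ISO C div(): both 32-bit operands are converted to double, divided in the FPU, the quotient is truncated toward zero (fcvtdl_z), and the remainder is recovered by back-multiplication (mulw/subw). Every 32-bit integer is exactly representable in a double, so the truncated FP quotient matches C integer division. A hedged C sketch of that data flow (the struct-return pointer convention and the GEN_INTDIV trap on a zero divisor are omitted):

#include <stdlib.h>

/* Sketch only; assumes denom != 0 (the assembly traps instead).  */
div_t
sketch_div (int numer, int denom)
{
  div_t r;
  r.quot = (int) ((double) numer / (double) denom);  /* fcvtld/fdivd/fcvtdl_z */
  r.rem = numer - r.quot * denom;                    /* mulw + subw */
  return r;
}
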
diff --git a/sysdeps/sw_64/div_libc.h b/sysdeps/sw_64/div_libc.h
new file mode 100644
index 00000000..2066924b
--- /dev/null
+++ b/sysdeps/sw_64/div_libc.h
@@ -0,0 +1,170 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Common bits for implementing software divide. */
+
+#include <sysdep.h>
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/hmcall.h>
+#else
+# include <machine/pal.h>
+#endif
+
+/* These are not normal C functions. Argument registers are t10 and t11;
+ the result goes in t12; the return address is in t9. Only t12 and AT
+ may be clobbered. */
+#define X t10
+#define Y t11
+#define RV t12
+#define RA t9
+
+/* The secureplt format does not allow the division routines to be called
+ via plt; there aren't enough registers free to be clobbered. Avoid
+ setting the symbol type to STT_FUNC, so that the linker won't be tempted
+ to create a plt entry. */
+#define funcnoplt notype
+
+/* None of these functions should use implicit anything. */
+ .set nomacro
+ .set noat
+
+/* Code fragment to invoke _mcount for profiling. This should be invoked
+ directly after allocation of the stack frame. */
+.macro CALL_MCOUNT
+#ifdef PROF
+ stl ra, 0(sp)
+ stl pv, 8(sp)
+ stl gp, 16(sp)
+ cfi_rel_offset (ra, 0)
+ cfi_rel_offset (pv, 8)
+ cfi_rel_offset (gp, 16)
+ br AT, 1f
+ .set macro
+1: ldgp gp, 0(AT)
+ mov RA, ra
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ ldl ra, 0(sp)
+ ldl pv, 8(sp)
+ ldl gp, 16(sp)
+ cfi_restore (ra)
+ cfi_restore (pv)
+ cfi_restore (gp)
+ /* Realign subsequent code with what we'd have without this
+ macro at all. This means aligned with one arithmetic insn
+ used within the bundle. */
+ .align 4
+ nop
+#endif
+.endm
+
+/* In order to make the below work, all top-level divide routines must
+ use the same frame size. */
+#define FRAME 96
+
+/* Code fragment to generate an integer divide-by-zero fault. When
+ building libc.so, we arrange for there to be one copy of this code
+ placed late in the dso, such that all branches are forward. When
+ building libc.a, we use multiple copies to avoid having an out of
+ range branch. Users should jump to DIVBYZERO. */
+
+.macro DO_DIVBYZERO
+#ifdef PIC
+#define DIVBYZERO __divbyzero
+ .section .gnu.linkonce.t.divbyzero, "ax", @progbits
+ .globl __divbyzero
+ .type __divbyzero, @function
+ .usepv __divbyzero, no
+ .hidden __divbyzero
+#else
+#define DIVBYZERO $divbyzero
+#endif
+
+ .align 4
+DIVBYZERO:
+ cfi_startproc
+ cfi_return_column (RA)
+ cfi_def_cfa_offset (FRAME)
+
+ mov a0, RV
+ unop
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+
+ mov RV, a0
+ clr RV
+ ldi sp, FRAME(sp)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size DIVBYZERO, .-DIVBYZERO
+.endm
+
+/* Like the sw6a instructions, but fall back to stack use on prior machines. */
+#ifdef __sw_64_sw6a__
+ .arch sw6a
+#endif
+#ifdef __sw_64_sw6b__
+ .arch sw6b
+#endif
+#ifdef __sw_64_sw8a__
+ .arch sw8a
+#endif
+
+.macro _ITOFS gr, fr, slot
+#ifdef __sw_64_fix__
+ ifmovs \gr, \fr
+#else
+ stw \gr, \slot(sp)
+ flds \fr, \slot(sp)
+#endif
+.endm
+
+.macro _ITOFT gr, fr, slot
+#ifdef __sw_64_fix__
+ ifmovd \gr, \fr
+#else
+ stl \gr, \slot(sp)
+ fldd \fr, \slot(sp)
+#endif
+.endm
+
+.macro _FTOIT fr, gr, slot
+#ifdef __sw_64_fix__
+ fimovd \fr, \gr
+#else
+ fstd \fr, \slot(sp)
+ ldl \gr, \slot(sp)
+#endif
+.endm
+
+/* Similarly, but move two registers. Schedules better for pre-sw6a. */
+
+.macro _ITOFT2 gr1, fr1, slot1, gr2, fr2, slot2
+#ifdef __sw_64_fix__
+ ifmovd \gr1, \fr1
+ ifmovd \gr2, \fr2
+#else
+ stl \gr1, \slot1(sp)
+ stl \gr2, \slot2(sp)
+ fldd \fr1, \slot1(sp)
+ fldd \fr2, \slot2(sp)
+#endif
+.endm
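The _ITOFT/_FTOIT macros above move a raw 64-bit pattern between the integer and floating-point register files, either with a direct move instruction (__sw_64_fix__) or by bouncing the bits through a stack slot. A conceptual C analogue of that fallback path (hypothetical helpers, not part of the patch):

#include <stdint.h>
#include <string.h>

/* _ITOFT fallback: stl gr, slot(sp); fldd fr, slot(sp).  A bit copy,
   not a value conversion -- fcvtld does the conversion later.  */
static inline double
int_to_fp_bits (int64_t g)
{
  double f;
  memcpy (&f, &g, sizeof f);
  return f;
}

/* _FTOIT fallback: fstd fr, slot(sp); ldl gr, slot(sp).  */
static inline int64_t
fp_to_int_bits (double f)
{
  int64_t g;
  memcpy (&g, &f, sizeof g);
  return g;
}
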
diff --git a/sysdeps/sw_64/divl.S b/sysdeps/sw_64/divl.S
new file mode 100644
index 00000000..1192a0aa
--- /dev/null
+++ b/sysdeps/sw_64/divl.S
@@ -0,0 +1,96 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+/* 32-bit signed int divide. This is not a normal C function. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ The FPU can handle all input values except zero. Whee!
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+
+ /*****************************************************************
+ # *
+ # transform to sw-instruct on 2016111216 *
+ # *
+ #****************************************************************/
+
+#ifndef EXTEND
+#define EXTEND(S,D) sextl S, D
+#endif
+
+ .text
+ .align 4
+ .globl __divw
+ .type __divw, @funcnoplt
+ .usepv __divw, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divw:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f2, 16(sp)
+ fstd $f3, 40(sp)
+ fstd $f4, 48(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f2, 16)
+ cfi_rel_offset ($f3, 40)
+ cfi_rel_offset ($f4, 48)
+
+ rfpcr $f2
+ EXTEND (X, RV)
+ EXTEND (Y, AT)
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
+ fcvtld $f0, $f3
+ fcvtld $f1, $f4
+ fdivd $f3, $f4, $f1
+ fcvtdl_z $f1, $f0
+ wfpcr $f2
+ _FTOIT $f0, RV, 24
+
+ fldd $f0, 0(sp)
+ fldd $f1, 8(sp)
+ fldd $f2, 16(sp)
+ fldd $f3, 40(sp)
+ fldd $f4, 48(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_def_cfa_offset (0)
+ sextl RV, RV
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __divw, .-__divw
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/divlu.S b/sysdeps/sw_64/divlu.S
new file mode 100644
index 00000000..26e1842f
--- /dev/null
+++ b/sysdeps/sw_64/divlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D) zapnot S, 15, D
+#define __divw __divwu
+#include <divl.S>
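divlu.S builds the unsigned 32-bit entry point __divwu by overriding EXTEND (zero-extension via zapnot instead of sign-extension via sextl) and the symbol name, then reusing the whole divl.S body. In C terms, the two EXTEND variants behave like this (illustrative only):

#include <stdint.h>

static inline int64_t
extend_signed (int32_t s)      /* sextl S, D  -- divl.S default */
{
  return (int64_t) s;          /* sign-extend to 64 bits */
}

static inline uint64_t
extend_unsigned (uint32_t s)   /* zapnot S, 15, D  -- divlu.S override */
{
  return (uint64_t) s;         /* keep the low 4 bytes, zero the rest */
}
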
diff --git a/sysdeps/sw_64/divq.S b/sysdeps/sw_64/divq.S
new file mode 100644
index 00000000..61ef58b4
--- /dev/null
+++ b/sysdeps/sw_64/divq.S
@@ -0,0 +1,290 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long divide. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+ /*****************************************************************
+ # *
+ # transform to sw-instruct on 2016111216 *
+ # *
+ #****************************************************************/
+ .text
+ .align 4
+ .globl __divl
+ .type __divl, @funcnoplt
+ .usepv __divl, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divl:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+ rfpcr $f3
+
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ fldd $f1, 8(sp)
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl_z $f0, $f4
+ excb
+
+ wfpcr $f3
+ _FTOIT $f4, RV, 16
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ cfi_restore ($f1)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ldi sp, FRAME(sp)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t5, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t5, 24)
+
+#define Q RV /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ or X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f4
+
+ _FTOIT $f4, Q, 8
+ .align 3
+$fix_sign_in_ret2:
+ fldd $f0, 0(sp)
+ stl t3, 0(sp)
+ cfi_restore ($f0)
+ cfi_rel_offset (t3, 0)
+
+ mull Q, Y, QY
+ excb
+ stl t4, 8(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+ bne t5, $fix_sign_out
+
+$fix_sign_out_ret:
+ ldl t3, 0(sp)
+ ldl t4, 8(sp)
+ ldl t5, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore (t5)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 records the changes we had to make:
+ bit 0: set if result should be negative.
+ bit 2: set if X was negated.
+ bit 3: set if Y was negated.
+ */
+ xor X, Y, AT
+ cmplt AT, 0, t5
+ cmplt X, 0, AT
+ negl X, t0
+
+ s4addl AT, t5, t5
+ selne AT, t0, X, X
+ cmplt Y, 0, AT
+ negl Y, t0
+
+ s8addl AT, t5, t5
+ selne AT, t0, Y, Y
+ unop
+ blbc t5, $fix_sign_in_ret1
+
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+ .align 4
+$fix_sign_out:
+ /* Now we get to undo what we did above. */
+ /* ??? Is this really faster than just increasing the size of
+ the stack frame and storing X and Y in memory? */
+ and t5, 8, AT
+ negl Y, t4
+ selne AT, t4, Y, Y
+
+ and t5, 4, AT
+ negl X, t4
+ selne AT, t4, X, X
+
+ negl RV, t4
+ sellbs t5, t4, RV, RV
+
+ br $fix_sign_out_ret
+
+ cfi_endproc
+ .size __divl, .-__divl
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/divqu.S b/sysdeps/sw_64/divqu.S
new file mode 100644
index 00000000..7b39201e
--- /dev/null
+++ b/sysdeps/sw_64/divqu.S
@@ -0,0 +1,292 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long divide. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
+ clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+	/* Transformed to SW instructions on 2016111216.  */
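The unsigned variant adds two special cases on top of the same idea: a divisor with bit 63 set forces a quotient of 0 or 1, and a dividend with bit 63 set converts to a negative double and is corrected by adding 2**64 (the 0x5f80 constant loaded below). A hedged C model follows; the clamp and the __int128 fixup are conveniences of the sketch rather than anything in the assembly.

#include <stdint.h>

/* Model of __divlu's special cases plus the generic fixup.  Assumes
   y != 0 (the real code branches to DIVBYZERO).  */
static uint64_t
divlu_model (uint64_t x, uint64_t y)
{
  if (y >> 63)
    return x >= y;                 /* quotient can only be 0 or 1 */

  /* The raw conversion views the bits as signed, so compensate when
     bit 63 of the dividend is set.  */
  double xd = (double) (int64_t) x;
  if (x >> 63)
    xd += 0x1p64;                  /* 2**64, the $x_is_neg adjustment */

  double est = xd / (double) y;
  uint64_t q = est >= 0x1p64 ? UINT64_MAX : (uint64_t) est;

  /* Same multiply-back fixup as the signed case ($x_big).  */
  unsigned __int128 qy = (unsigned __int128) q * y;
  while (qy > x)       { q--; qy -= y; }
  while (x - qy >= y)  { q++; qy += y; }
  return q;
}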
+ .text
+ .align 4
+ .globl __divlu
+ .type __divlu, @funcnoplt
+ .usepv __divlu, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__divlu:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ beq Y, DIVBYZERO
+ fstd $f0, 0(sp)
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ stl t0,32(sp)
+ stl t1,40(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+
+ rfpcr $f3
+	/* Added to work around an error seen with -mieee for
+	   0xffffffffffffffff/2.  */
+ rfpcr $f1
+ fimovd $f1,t0
+ ldi t1,3
+ sll t1,58,t1
+ bic t0,t1,t0
+ ifmovd t0,$f1
+ wfpcr $f1
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ blt X, $x_is_neg
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if Y was mis-converted as signed value. */
+ fldd $f1, 8(sp)
+ blt Y, $y_is_neg
+
+ /* Check to see if X fit in the double as an exact value. */
+ srl X, 53, AT
+ bne AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl $f0, $f4
+ wfpcr $f3
+ _FTOIT $f4, RV, 16
+
+ ldl t0,32(sp)
+ ldl t1,40(sp)
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+$x_is_neg:
+ /* If we get here, X is so big that bit 63 is set, which made the
+ conversion come out negative. Fix it up lest we not even get
+ a good estimate. */
+ ldih AT, 0x5f80 /* 2**64 as float. */
+ fstd $f2, 24(sp)
+ fstd $f6, 72(sp)
+ cfi_rel_offset ($f2, 24)
+ cfi_rel_offset ($f5, 72)
+ _ITOFS AT, $f2, 16
+
+ .align 4
+ faddd $f4, $f2, $f6
+ unop
+ fdivd $f6, $f5, $f0
+ unop
+
+	/* Ok, we've now got the divide issued.  Continue with other checks.  */
+ fldd $f1, 8(sp)
+ unop
+ fldd $f2, 24(sp)
+ fldd $f6, 72(sp)
+ blt Y, $y_is_neg
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f6)
+ cfi_remember_state /* for y_is_neg */
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t2, 16(sp)
+ stl t3, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t3, 24)
+
+#define Q RV /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ fcvtdl $f0, $f4
+ _FTOIT $f4, Q, 8
+ mull Q, Y, QY
+
+ .align 4
+ stl t4, 8(sp)
+ excb
+ fldd $f0, 0(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+ cfi_restore ($f0)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t4, 8(sp)
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+
+ ldl t3, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+ cfi_restore_state
+$y_is_neg:
+ /* If we get here, Y is so big that bit 63 is set. The results
+ from the divide will be completely wrong. Fortunately, the
+ quotient must be either 0 or 1, so just compute it directly. */
+ cmpule Y, X, RV
+ excb
+ wfpcr $f3
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldl t0,32(sp)
+ ldl t1,40(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+ cfi_endproc
+ .size __divlu, .-__divlu
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/htonl.S b/sysdeps/sw_64/htonl.S
new file mode 100644
index 00000000..7fc0aa24
--- /dev/null
+++ b/sysdeps/sw_64/htonl.S
@@ -0,0 +1,43 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(htonl)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ .set noat
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set at
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ins6b a0, 7, t0 # t0 = 0000000000AABBCC
+ ins1b a0, 3, t1 # t1 = 000000CCDD000000
+ or t1, t0, t1 # t1 = 000000CCDDAABBCC
+ srl t1, 16, t2 # t2 = 0000000000CCDDAA
+ zapnot t1, 0x0A, t0 # t0 = 00000000DD00BB00
+ zapnot t2, 0x05, t3 # t3 = 0000000000CC00AA
+ addw t0, t3, v0 # v0 = ssssssssDDCCBBAA
+ ret
+
+ END(htonl)
+
+weak_alias (htonl, ntohl)
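The byte-lane comments above amount to an ordinary 32-bit byte swap; a portable C rendering of the same result is given here purely for reference (the helper name is invented for this sketch).

#include <stdint.h>

/* What the ins6b/ins1b/zapnot sequence produces for x = 0xAABBCCDD.  */
static uint32_t
bswap32_model (uint32_t x)
{
  return ((x & 0x000000ffu) << 24)   /* DD -> 0xDD000000 */
       | ((x & 0x0000ff00u) << 8)    /* CC -> 0x00CC0000 */
       | ((x & 0x00ff0000u) >> 8)    /* BB -> 0x0000BB00 */
       | ((x >> 24) & 0xffu);        /* AA -> 0x000000AA */
}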
diff --git a/sysdeps/sw_64/htons.S b/sysdeps/sw_64/htons.S
new file mode 100644
index 00000000..8a981be1
--- /dev/null
+++ b/sysdeps/sw_64/htons.S
@@ -0,0 +1,39 @@
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(htons)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ .set noat
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set at
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ext5b a0, 7, t1 # t1 = bb00
+ ext0b a0, 1, v0 # v0 = 00aa
+ bis v0, t1, v0 # v0 = bbaa
+ ret
+
+ END(htons)
+
+weak_alias (htons, ntohs)
diff --git a/sysdeps/sw_64/ldiv.S b/sysdeps/sw_64/ldiv.S
new file mode 100644
index 00000000..7a77d6dd
--- /dev/null
+++ b/sysdeps/sw_64/ldiv.S
@@ -0,0 +1,222 @@
+
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Richard Henderson <rth@tamu.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include "div_libc.h"
+
+#undef FRAME
+#ifdef __sw_64_fix__
+#define FRAME 0
+#else
+#define FRAME 16
+#endif
+
+#undef X
+#undef Y
+#define X $17
+#define Y $18
+
+ .set noat
+
+ .align 4
+ .globl ldiv
+ .ent ldiv
+ldiv:
+ .frame sp, FRAME, ra
+#if FRAME > 0
+ ldi sp, -FRAME(sp)
+#endif
+#ifdef PROF
+ .set macro
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .set nomacro
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ beq Y, $divbyzero
+ mov Y,t6
+ nop
+ rfpcr $f10
+
+ _ITOFT2 X, $f0, 0, Y, $f1, 8
+
+ .align 4
+ fcvtld $f0, $f11
+ fcvtld $f1, $f12
+ fdivd $f11, $f12, $f0
+ unop
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert and clean up. */
+ fcvtdl_z $f0, $f11
+ nop
+ wfpcr $f10
+ _FTOIT $f11, $0, 0
+
+$egress:
+// mull $0, Y, $1
+ mull $0, t6, $1
+ subl X, $1, $1
+
+ stl $0, 0($16)
+ stl $1, 8($16)
+ mov $16, $0
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+
+#define Q v0 /* quotient */
+#define R t0 /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ bis X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f11
+
+ _FTOIT $f11, Q, 8
+$fix_sign_in_ret2:
+ mull Q, Y, QY
+ nop
+ wfpcr $f10
+
+ .align 4
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ negl Q, t4
+ sellbs t5, t4, Q, Q
+ br $egress
+
+ .align 4
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 is true if result should be negative. */
+ xor X, Y, AT
+ cmplt AT, 0, t5
+ cmplt X, 0, AT
+ negl X, t0
+
+ selne AT, t0, X, X
+ cmplt Y, 0, AT
+ negl Y, t0
+
+ selne AT, t0, Y, Y
+ blbc t5, $fix_sign_in_ret1
+
+ fcvtdl_z $f0, $f11
+ _FTOIT $f11, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+$divbyzero:
+ mov a0, v0
+ ldi a0, GEN_INTDIV
+ sys_call HMC_gentrap
+ stl zero, 0(v0)
+ stl zero, 8(v0)
+
+#if FRAME > 0
+ ldi sp, FRAME(sp)
+#endif
+ ret
+
+ .end ldiv
+
+weak_alias (ldiv, lldiv)
+weak_alias (ldiv, imaxdiv)
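The $egress path above stores the quotient at offset 0 and the remainder at offset 8 of the struct whose address arrives in $16, reconstructing the remainder from the saved divisor in t6 as x - q*y. In C terms, leaving aside the y == 0 trap and the LONG_MIN / -1 overflow case, that is simply the following sketch.

#include <stdlib.h>

/* Illustrative equivalent of the store sequence at $egress.  */
ldiv_t
ldiv_model (long x, long y)
{
  ldiv_t r;
  r.quot = x / y;            /* produced by the divide-and-fixup code */
  r.rem  = x - r.quot * y;   /* mull $0,t6,$1 ; subl X,$1,$1          */
  return r;
}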
diff --git a/sysdeps/sw_64/lldiv.S b/sysdeps/sw_64/lldiv.S
new file mode 100644
index 00000000..8a8ef97a
--- /dev/null
+++ b/sysdeps/sw_64/lldiv.S
@@ -0,0 +1 @@
+/* lldiv is the same as ldiv on the Sw_64. */
diff --git a/sysdeps/sw_64/lshift.S b/sysdeps/sw_64/lshift.S
new file mode 100644
index 00000000..700e9d80
--- /dev/null
+++ b/sysdeps/sw_64/lshift.S
@@ -0,0 +1,107 @@
+ # Sw_64 1621 __mpn_lshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldl and stl can be paired with the other used
+ # instructions. But there are many restrictions in the 1621 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
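As a functional reference for the unrolled code below, here is a C sketch under the assumption of 64-bit limbs and 0 < cnt < 64; the typedef and function name are illustrative, not part of the patch.

#include <stddef.h>

typedef unsigned long mp_limb_t;   /* 64-bit limb on Sw_64 */

/* Shift {s1_ptr, size} left by cnt bits into {res_ptr, size}, working
   from the most significant limb down; return the bits shifted out.  */
mp_limb_t
mpn_lshift_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
                size_t size, unsigned int cnt)
{
  unsigned int tnc = 64 - cnt;
  mp_limb_t high = s1_ptr[size - 1];
  mp_limb_t retval = high >> tnc;          /* srl $4,$7,$0 */

  for (size_t i = size - 1; i > 0; i--)
    {
      mp_limb_t low = s1_ptr[i - 1];
      res_ptr[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  res_ptr[0] = high << cnt;
  return retval;
}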
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $17,8,$17
+ subl $31,$19,$7
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ srl $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subl $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldl $3,-8($17)
+ subl $16,8,$16
+ subl $17,8,$17
+ subl $20,1,$20
+ sll $4,$19,$5
+ srl $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stl $8,0($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldl $3,-8($17)
+ subl $16,32,$16
+ subl $18,4,$18
+ sll $4,$19,$5
+ srl $3,$7,$6
+
+ ldl $4,-16($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stl $8,24($16)
+ srl $4,$7,$2
+
+ ldl $3,-24($17)
+ sll $4,$19,$5
+ bis $1,$2,$8
+ stl $8,16($16)
+ srl $3,$7,$6
+
+ ldl $4,-32($17)
+ sll $3,$19,$1
+ bis $5,$6,$8
+ stl $8,8($16)
+ srl $4,$7,$2
+
+ subl $17,32,$17
+ bis $1,$2,$8
+ stl $8,0($16)
+
+ bgt $18,.Loop
+
+.Lend: sll $4,$19,$8
+ stl $8,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
diff --git a/sysdeps/sw_64/mul_1.S b/sysdeps/sw_64/mul_1.S
new file mode 100644
index 00000000..127f4274
--- /dev/null
+++ b/sysdeps/sw_64/mul_1.S
@@ -0,0 +1,82 @@
+ # Sw_64 1621 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Sw_64
+ # architecture. 2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR. Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
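For reference, the loop below computes the following recurrence, shown here as a C sketch (the typedefs and the use of unsigned __int128 to model mull/umulh are assumptions of the sketch).

#include <stddef.h>

typedef unsigned long mp_limb_t;          /* 64-bit limb */
typedef unsigned __int128 mp_twolimb_t;   /* full 64x64 product */

/* res_ptr[i] = low (s1_ptr[i] * s2_limb + carry); return the final carry.  */
mp_limb_t
mpn_mul_1_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
               size_t size, mp_limb_t s2_limb)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      mp_twolimb_t prod = (mp_twolimb_t) s1_ptr[i] * s2_limb + cy;
      res_ptr[i] = (mp_limb_t) prod;       /* mull  */
      cy = (mp_limb_t) (prod >> 64);       /* umulh plus the carry adds */
    }
  return cy;
}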
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_mul_1
+ .ent __mpn_mul_1 2
+__mpn_mul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ bic $31,$31,$4 # clear cy_limb
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,Lend1 # jump if size was == 1
+ ldl $2,8($17) # $2 = s1_limb
+ subl $18,1,$18 # size--
+ stl $3,0($16)
+ beq $18,Lend2 # jump if size was == 2
+
+ .align 3
+Loop: mull $2,$19,$3 # $3 = prod_low
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,16($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ stl $3,8($16)
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ addl $16,8,$16 # res_ptr++
+ bne $18,Loop
+
+Lend2: mull $2,$19,$3 # $3 = prod_low
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ stl $3,8($16)
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+Lend1: stl $3,0($16)
+ ret $31,($26),1
+
+ .end __mpn_mul_1
diff --git a/sysdeps/sw_64/reml.S b/sysdeps/sw_64/reml.S
new file mode 100644
index 00000000..56a550d9
--- /dev/null
+++ b/sysdeps/sw_64/reml.S
@@ -0,0 +1,93 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@twiddle.net>
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+/* 32-bit signed int remainder. This is not a normal C function. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ The FPU can handle the division for all input values except zero.
+ All we have to do is compute the remainder via multiply-and-subtract.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
+	/* __reml -> __remw, 20161111.  */
+#ifndef EXTEND
+#define EXTEND(S,D) sextl S, D
+#endif
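The multiply-and-subtract idea in the comment above is, in C, the short sketch below; it leaves aside the INT32_MIN / -1 overflow case and assumes y != 0.

#include <stdint.h>

/* Model of __remw: the double division is exact for 32-bit operands,
   so truncate it and form the remainder directly.  */
static int32_t
remw_model (int32_t x, int32_t y)
{
  int32_t q = (int32_t) ((double) x / (double) y);  /* fdivd + fcvtdl_z */
  return x - q * y;                                 /* mulw + subw      */
}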
+
+ .text
+ .align 4
+ .globl __remw
+ .type __remw, @funcnoplt
+ .usepv __remw, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__remw:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f2, 16(sp)
+ fstd $f3, 40(sp)
+ fstd $f4, 48(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f2, 16)
+ cfi_rel_offset ($f3, 40)
+ cfi_rel_offset ($f4, 48)
+
+ rfpcr $f2
+ EXTEND (X, RV)
+ EXTEND (Y, AT)
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
+ fcvtld $f0, $f3
+ fcvtld $f1, $f4
+ fdivd $f3, $f4, $f0
+ fcvtdl_z $f0, $f3
+
+ wfpcr $f2
+ _FTOIT $f3, RV, 24
+ fldd $f0, 0(sp)
+ mulw RV, Y, RV
+ fldd $f1, 8(sp)
+ fldd $f2, 16(sp)
+ fldd $f3, 40(sp)
+ fldd $f4, 48(sp)
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_def_cfa_offset (0)
+ subw X, RV, RV
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __remw, .-__remw
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/remlu.S b/sysdeps/sw_64/remlu.S
new file mode 100644
index 00000000..3c12f7bf
--- /dev/null
+++ b/sysdeps/sw_64/remlu.S
@@ -0,0 +1,4 @@
+#define UNSIGNED
+#define EXTEND(S,D) zapnot S, 15, D
+#define __remw __remwu
+#include <reml.S>
diff --git a/sysdeps/sw_64/remq.S b/sysdeps/sw_64/remq.S
new file mode 100644
index 00000000..6db7f628
--- /dev/null
+++ b/sysdeps/sw_64/remq.S
@@ -0,0 +1,274 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit signed long remainder. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
+ be clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
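The signed remainder is recovered from the quotient as x - q*y, and its sign follows the dividend, which is why only the "X was negated" bit of t5 controls the final negation of RV further down. A small C illustration, ignoring y == 0 and the INT64_MIN / -1 case:

#include <stdint.h>

/* Sketch of what __reml returns once the quotient is known.  */
static int64_t
reml_model (int64_t x, int64_t y)
{
  int64_t q = x / y;        /* what the estimate-plus-fixup produces */
  int64_t r = x - q * y;    /* mull + subl                           */
  /* e.g. (-7) % 2 == -1 and 7 % (-2) == 1: the sign tracks x.  */
  return r;
}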
+ .text
+ .align 4
+ .globl __reml
+ .type __reml, @funcnoplt
+ .usepv __reml, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__reml:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ fstd $f0, 0(sp)
+ excb
+ beq Y, DIVBYZERO
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+
+ rfpcr $f3
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if X fit in the double as an exact value. */
+ sll X, (64-53), AT
+ fldd $f1, 8(sp)
+ sra AT, (64-53), AT
+ cmpeq X, AT, AT
+ beq AT, $x_big
+ fcvtdl_z $f0, $f4
+
+ wfpcr $f3
+ _FTOIT $f4, AT, 16
+ mull AT, Y, AT
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ cfi_restore ($f1)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ldi sp, FRAME(sp)
+ subl X, AT, RV
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t5, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t5, 24)
+
+#define Q t0 /* quotient */
+#define R RV /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ /* The fixup code below can only handle unsigned values. */
+ or X, Y, AT
+ mov $31, t5
+ blt AT, $fix_sign_in
+$fix_sign_in_ret1:
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+$fix_sign_in_ret2:
+ fldd $f0, 0(sp)
+ stl t3, 0(sp)
+ cfi_restore ($f0)
+ cfi_rel_offset (t3, 0)
+
+ mull Q, Y, QY
+ stl t4, 8(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+ bne t5, $fix_sign_out
+
+$fix_sign_out_ret:
+ ldl t3, 0(sp)
+ ldl t4, 8(sp)
+ ldl t5, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore (t5)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+$fix_sign_in:
+ /* If we got here, then X|Y is negative. Need to adjust everything
+ such that we're doing unsigned division in the fixup loop. */
+ /* T5 records the changes we had to make:
+ bit 0: set if X was negated. Note that the sign of the
+		remainder follows the sign of the dividend.
+ bit 2: set if Y was negated.
+ */
+ xor X, Y, t1
+ cmplt X, 0, t5
+ negl X, t0
+ selne t5, t0, X, X
+
+ cmplt Y, 0, AT
+ negl Y, t0
+ s4addl AT, t5, t5
+ selne AT, t0, Y, Y
+
+ bge t1, $fix_sign_in_ret1
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ .align 3
+ negl Q, Q
+ br $fix_sign_in_ret2
+
+ .align 4
+$fix_sign_out:
+ /* Now we get to undo what we did above. */
+ /* ??? Is this really faster than just increasing the size of
+ the stack frame and storing X and Y in memory? */
+ and t5, 4, AT
+ negl Y, t4
+ selne AT, t4, Y, Y
+
+ negl X, t4
+ sellbs t5, t4, X, X
+ negl RV, t4
+ sellbs t5, t4, RV, RV
+
+ br $fix_sign_out_ret
+
+ cfi_endproc
+ .size __reml, .-__reml
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/remqu.S b/sysdeps/sw_64/remqu.S
new file mode 100644
index 00000000..946e031b
--- /dev/null
+++ b/sysdeps/sw_64/remqu.S
@@ -0,0 +1,292 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "div_libc.h"
+
+
+/* 64-bit unsigned long remainder. These are not normal C functions. Argument
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
+ clobbered.
+
+ Theory of operation here is that we can use the FPU divider for virtually
+ all operands that we see: all dividend values between -2**53 and 2**53-1
+ can be computed directly. Note that divisor values need not be checked
+ against that range because the rounded fp value will be close enough such
+ that the quotient is < 1, which will properly be truncated to zero when we
+ convert back to integer.
+
+ When the dividend is outside the range for which we can compute exact
+   results, we use the fp quotient as an estimate from which we begin refining
+ an exact integral value. This reduces the number of iterations in the
+ shift-and-subtract loop significantly.
+
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
+ for cvttq/c even without /sui being set. It will not, however, properly
+ raise the exception, so we don't have to worry about FPCR_INED being clear
+ and so dying by SIGFPE. */
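Besides the shared estimate-and-fix machinery, the entry code below peels off power-of-two divisors: (y & (y - 1)) == 0 means at most one bit of y is set, and the remainder is then a simple mask. A hedged C model of that check; the placeholder return for y == 0 stands in for the divide-by-zero trap.

#include <stdint.h>

/* Model of the $powerof2 shortcut in __remlu.  */
static uint64_t
remlu_model (uint64_t x, uint64_t y)
{
  if ((y & (y - 1)) == 0)      /* subl Y,1,AT ; and Y,AT,AT ; beq AT,... */
    {
      if (y == 0)
        return 0;              /* real code: branch to DIVBYZERO         */
      return x & (y - 1);      /* and X,AT,RV                            */
    }
  return x % y;                /* general path: FP estimate + fixup      */
}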
+ .text
+ .align 4
+ .globl __remlu
+ .type __remlu, @funcnoplt
+ .usepv __remlu, no
+
+ cfi_startproc
+ cfi_return_column (RA)
+__remlu:
+ ldi sp, -FRAME(sp)
+ cfi_def_cfa_offset (FRAME)
+ CALL_MCOUNT
+
+ /* Get the fp divide insn issued as quickly as possible. After
+ that's done, we have at least 22 cycles until its results are
+ ready -- all the time in the world to figure out how we're
+ going to use the results. */
+ subl Y, 1, AT
+ and Y, AT, AT
+ beq AT, $powerof2
+ fstd $f0, 0(sp)
+
+
+ fstd $f1, 8(sp)
+ fstd $f3, 48(sp)
+ fstd $f4, 56(sp)
+ fstd $f5, 64(sp)
+ cfi_rel_offset ($f0, 0)
+ cfi_rel_offset ($f1, 8)
+ cfi_rel_offset ($f3, 48)
+ cfi_rel_offset ($f4, 56)
+ cfi_rel_offset ($f5, 64)
+
+ rfpcr $f3
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
+
+ fcvtld $f0, $f4
+ fcvtld $f1, $f5
+
+ blt X, $x_is_neg
+	setfpec1
+ fdivd $f4, $f5, $f0
+
+ /* Check to see if Y was mis-converted as signed value. */
+ fldd $f1, 8(sp)
+ blt Y, $y_is_neg
+
+ /* Check to see if X fit in the double as an exact value. */
+ srl X, 53, AT
+ bne AT, $x_big
+
+ /* If we get here, we're expecting exact results from the division.
+ Do nothing else besides convert, compute remainder, clean up. */
+ fcvtdl_z $f0, $f4
+ wfpcr $f3
+ _FTOIT $f4, AT, 16
+ mull AT, Y, AT
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore ($f0)
+ cfi_restore ($f1)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+
+ .align 4
+ subl X, AT, RV
+ ret $31, (RA), 1
+ .align 4
+ cfi_restore_state
+$x_is_neg:
+ /* If we get here, X is so big that bit 63 is set, which made the
+ conversion come out negative. Fix it up lest we not even get
+ a good estimate. */
+ ldih AT, 0x5f80 /* 2**64 as float. */
+ fstd $f2, 24(sp)
+ fstd $f6, 72(sp)
+ cfi_rel_offset ($f2, 24)
+ cfi_rel_offset ($f6, 72)
+ _ITOFS AT, $f2, 16
+ .align 4
+ faddd $f4, $f2, $f6
+ fdivd $f6, $f5, $f0
+
+	/* Ok, we've now got the divide issued.  Continue with other checks. */
+# .align 4
+ fldd $f1, 8(sp)
+ unop
+ fldd $f2, 24(sp)
+ fldd $f6, 72(sp)
+ blt Y, $y_is_neg
+ cfi_restore ($f1)
+ cfi_restore ($f2)
+ cfi_restore ($f6)
+ cfi_remember_state /* for y_is_neg */
+
+ .align 4
+$x_big:
+ /* If we get here, X is large enough that we don't expect exact
+ results, and neither X nor Y got mis-translated for the fp
+ division. Our task is to take the fp result, figure out how
+ far it's off from the correct result and compute a fixup. */
+ stl t0, 32(sp)
+ stl t1, 40(sp)
+ stl t2, 16(sp)
+ stl t3, 24(sp)
+ cfi_rel_offset (t0, 32)
+ cfi_rel_offset (t1, 40)
+ cfi_rel_offset (t2, 16)
+ cfi_rel_offset (t3, 24)
+
+#define Q t0 /* quotient */
+#define R RV /* remainder */
+#define SY t1 /* scaled Y */
+#define S t2 /* scalar */
+#define QY t3 /* Q*Y */
+
+ fcvtdl_z $f0, $f4
+ _FTOIT $f4, Q, 8
+ mull Q, Y, QY
+
+ .align 4
+ stl t4, 8(sp)
+ excb
+ fldd $f0, 0(sp)
+ wfpcr $f3
+ cfi_rel_offset (t4, 8)
+ cfi_restore ($f0)
+
+ subl QY, X, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_high
+
+$q_high_ret:
+ subl X, QY, R
+ mov Y, SY
+ mov 1, S
+ bgt R, $q_low
+
+$q_low_ret:
+ ldl t4, 8(sp)
+ ldl t0, 32(sp)
+ ldl t1, 40(sp)
+ ldl t2, 16(sp)
+
+ ldl t3, 24(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ ldi sp, FRAME(sp)
+ cfi_remember_state
+ cfi_restore (t0)
+ cfi_restore (t1)
+ cfi_restore (t2)
+ cfi_restore (t3)
+ cfi_restore (t4)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ .align 4
+ cfi_restore_state
+ /* The quotient that we computed was too large. We need to reduce
+ it by S such that Y*S >= R. Obviously the closer we get to the
+ correct value the better, but overshooting high is ok, as we'll
+ fix that up later. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_high:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ subl Q, S, Q
+ unop
+ subl QY, SY, QY
+ br $q_high_ret
+
+ .align 4
+ /* The quotient that we computed was too small. Divide Y by the
+ current remainder (R) and add that to the existing quotient (Q).
+ The expectation, of course, is that R is much smaller than X. */
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
+ already have a copy of Y in SY and the value 1 in S. */
+0:
+ addl SY, SY, SY
+ addl S, S, S
+$q_low:
+ cmpult SY, R, AT
+ bne AT, 0b
+
+ /* Shift-down and subtract loop. Each iteration compares our scaled
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ Y's scalar (S) so add it to the quotient (Q). */
+2: addl Q, S, t3
+ srl S, 1, S
+ cmpule SY, R, AT
+ subl R, SY, t4
+
+ selne AT, t3, Q, Q
+ selne AT, t4, R, R
+ srl SY, 1, SY
+ bne S, 2b
+
+ br $q_low_ret
+
+ .align 4
+ cfi_restore_state
+$y_is_neg:
+ /* If we get here, Y is so big that bit 63 is set. The results
+ from the divide will be completely wrong. Fortunately, the
+ quotient must be either 0 or 1, so the remainder must be X
+ or X-Y, so just compute it directly. */
+ cmpule Y, X, AT
+ nop
+ wfpcr $f3
+ subl X, Y, RV
+ fldd $f0, 0(sp)
+ fldd $f3, 48(sp)
+ fldd $f4, 56(sp)
+ fldd $f5, 64(sp)
+ seleq AT, X, RV, RV
+
+ ldi sp, FRAME(sp)
+ cfi_restore ($f0)
+ cfi_restore ($f3)
+ cfi_restore ($f4)
+ cfi_restore ($f5)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+ .align 4
+ cfi_def_cfa_offset (FRAME)
+$powerof2:
+ subl Y, 1, AT
+ beq Y, DIVBYZERO
+ and X, AT, RV
+ ldi sp, FRAME(sp)
+ cfi_def_cfa_offset (0)
+ ret $31, (RA), 1
+
+ cfi_endproc
+ .size __remlu, .-__remlu
+
+ DO_DIVBYZERO
diff --git a/sysdeps/sw_64/rshift.S b/sysdeps/sw_64/rshift.S
new file mode 100644
index 00000000..81b3d742
--- /dev/null
+++ b/sysdeps/sw_64/rshift.S
@@ -0,0 +1,105 @@
+ # Sw_64 1621 __mpn_rshift --
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
+ # it would take 4 cycles/limb. It should be possible to get down to 3
+ # cycles/limb since both ldl and stl can be paired with the other used
+ # instructions. But there are many restrictions in the 1621 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ addl $17,8,$17
+ subl $31,$19,$7
+ subl $18,1,$18
+ and $18,4-1,$20 # number of limbs in first loop
+ sll $4,$7,$0 # compute function result
+
+ beq $20,.L0
+ subl $18,$20,$18
+
+ .align 3
+.Loop0:
+ ldl $3,0($17)
+ addl $16,8,$16
+ addl $17,8,$17
+ subl $20,1,$20
+ srl $4,$19,$5
+ sll $3,$7,$6
+ bis $3,$3,$4
+ bis $5,$6,$8
+ stl $8,-8($16)
+ bne $20,.Loop0
+
+.L0: beq $18,.Lend
+
+ .align 3
+.Loop: ldl $3,0($17)
+ addl $16,32,$16
+ subl $18,4,$18
+ srl $4,$19,$5
+ sll $3,$7,$6
+
+ ldl $4,8($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stl $8,-32($16)
+ sll $4,$7,$2
+
+ ldl $3,16($17)
+ srl $4,$19,$5
+ bis $1,$2,$8
+ stl $8,-24($16)
+ sll $3,$7,$6
+
+ ldl $4,24($17)
+ srl $3,$19,$1
+ bis $5,$6,$8
+ stl $8,-16($16)
+ sll $4,$7,$2
+
+ addl $17,32,$17
+ bis $1,$2,$8
+ stl $8,-8($16)
+
+ bgt $18,.Loop
+
+.Lend: srl $4,$19,$8
+ stl $8,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sub_n.S b/sysdeps/sw_64/sub_n.S
new file mode 100644
index 00000000..d0d5a30c
--- /dev/null
+++ b/sysdeps/sw_64/sub_n.S
@@ -0,0 +1,118 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
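A reference version of the borrow propagation used below, written in C; the typedef and helper name are illustrative, and size > 0 is assumed as the header says.

#include <stddef.h>

typedef unsigned long mp_limb_t;   /* 64-bit limb */

/* res = s1 - s2 over 'size' limbs; return the final borrow.  The two
   cmpult results in the assembly correspond to cy1 and cy2 here.  */
mp_limb_t
mpn_sub_n_ref (mp_limb_t *res_ptr, const mp_limb_t *s1_ptr,
               const mp_limb_t *s2_ptr, size_t size)
{
  mp_limb_t borrow = 0;
  for (size_t i = 0; i < size; i++)
    {
      mp_limb_t t = s2_ptr[i] + borrow;    /* addl $4,$0,$4           */
      mp_limb_t cy1 = t < borrow;          /* cmpult: the add wrapped */
      mp_limb_t diff = s1_ptr[i] - t;      /* subl $3,$4,$4           */
      mp_limb_t cy2 = s1_ptr[i] < diff;    /* cmpult: the sub wrapped */
      res_ptr[i] = diff;
      borrow = cy1 | cy2;
    }
  return borrow;
}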
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ ldl $3,0($17)
+ ldl $4,0($18)
+
+ subl $19,1,$19
+ and $19,4-1,$2 # number of limbs in first loop
+ bis $31,$31,$0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+ subl $19,$2,$19
+
+.Loop0: subl $2,1,$2
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ addl $17,8,$17
+ addl $18,8,$18
+ bis $5,$5,$3
+ bis $6,$6,$4
+ addl $16,8,$16
+ bne $2,.Loop0
+
+.L0: beq $19,.Lend
+
+ .align 3
+.Loop: subl $19,4,$19
+
+ ldl $5,8($17)
+ addl $4,$0,$4
+ ldl $6,8($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+
+ ldl $3,16($17)
+ addl $6,$0,$6
+ ldl $4,16($18)
+ cmpult $6,$0,$1
+ subl $5,$6,$6
+ cmpult $5,$6,$0
+ stl $6,8($16)
+ or $0,$1,$0
+
+ ldl $5,24($17)
+ addl $4,$0,$4
+ ldl $6,24($18)
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,16($16)
+ or $0,$1,$0
+
+ ldl $3,32($17)
+ addl $6,$0,$6
+ ldl $4,32($18)
+ cmpult $6,$0,$1
+ subl $5,$6,$6
+ cmpult $5,$6,$0
+ stl $6,24($16)
+ or $0,$1,$0
+
+ addl $17,32,$17
+ addl $18,32,$18
+ addl $16,32,$16
+ bne $19,.Loop
+
+.Lend: addl $4,$0,$4
+ cmpult $4,$0,$1
+ subl $3,$4,$4
+ cmpult $3,$4,$0
+ stl $4,0($16)
+ or $0,$1,$0
+ ret $31,($26),1
+
+ .end __mpn_sub_n
diff --git a/sysdeps/sw_64/submul_1.S b/sysdeps/sw_64/submul_1.S
new file mode 100644
index 00000000..2cad2bef
--- /dev/null
+++ b/sysdeps/sw_64/submul_1.S
@@ -0,0 +1,89 @@
+ # Sw_64 1621 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # s2_limb r19
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_submul_1
+ .ent __mpn_submul_1 2
+__mpn_submul_1:
+ .frame $30,0,$26
+
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ umulh $2,$19,$0 # $0 = prod_high
+ beq $18,.Lend1 # jump if size was == 1
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ subl $18,1,$18 # size--
+ subl $5,$3,$3
+ cmpult $5,$3,$4
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ beq $18,.Lend2 # jump if size was == 2
+
+ .align 3
+.Loop: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ subl $18,1,$18 # size--
+ umulh $2,$19,$4 # $4 = cy_limb
+ ldl $2,0($17) # $2 = s1_limb
+ addl $17,8,$17 # s1_ptr++
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $16,8,$16 # res_ptr++
+ addl $5,$0,$0 # combine carries
+ bne $18,.Loop
+
+.Lend2: mull $2,$19,$3 # $3 = prod_low
+ ldl $5,0($16) # $5 = *res_ptr
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
+ umulh $2,$19,$4 # $4 = cy_limb
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
+ subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $5,$0,$0 # combine carries
+ addl $4,$0,$0 # cy_limb = prod_high + cy
+ ret $31,($26),1
+.Lend1: subl $5,$3,$3
+ cmpult $5,$3,$5
+ stl $3,0($16)
+ addl $0,$5,$0
+ ret $31,($26),1
+
+ .end __mpn_submul_1
diff --git a/sysdeps/sw_64/sw6a/add_n.S b/sysdeps/sw_64/sw6a/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+	or	$8,$25,$25	# combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+	or	$8,$25,$25	# combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+	or	$8,$25,$25	# combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+	or	$8,$25,$25	# combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+	or	$8,$25,$25	# combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+	or	$8,$25,$25	# combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+	or	$8,$25,$25	# combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+	or	$8,$25,$25	# combine cy from the two adds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+	or	$8,$25,$25	# combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
diff --git a/sysdeps/sw_64/sw6a/addmul_1.S b/sysdeps/sw_64/sw6a/addmul_1.S
new file mode 100644
index 00000000..287e8573
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with pipeline expert
+ # . Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores).
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority.  It first looks at L0, and goes there.  The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd
+ # like not to have a ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
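
Stripped of the software pipelining, the per-limb recurrence that both the 1-7 limb loops and the unrolled loop of __mpn_addmul_1 implement is: low/high product of the source limb and s2_limb (mull/umulh), plus the running carry limb, plus the existing *res_ptr word, with the carries folded into the next carry limb. A C sketch of that recurrence follows; it uses GCC's unsigned __int128 in place of the umulh instruction, and the names and types are illustrative, not the glibc/GMP declarations.

typedef unsigned long mp_limb_t;   /* assumes 64-bit limbs */

mp_limb_t
addmul_1_sketch (mp_limb_t *rp, const mp_limb_t *s1p,
                 long n, mp_limb_t s2_limb)
{
  mp_limb_t cy = 0;                          /* cy_limb */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) s1p[i] * s2_limb;
      mp_limb_t lo = (mp_limb_t) p;          /* mull  */
      mp_limb_t hi = (mp_limb_t) (p >> 64);  /* umulh */
      lo += cy;
      hi += lo < cy;                         /* carry out of lo + cy */
      lo += rp[i];
      hi += lo < rp[i];                      /* carry out of lo + *res_ptr */
      rp[i] = lo;
      cy = hi;
    }
  return cy;                                 /* returned carry limb */
}
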
diff --git a/sysdeps/sw_64/sw6a/lshift.S b/sysdeps/sw_64/sw6a/lshift.S
new file mode 100644
index 00000000..cc00593c
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift -- Shift a limb vector left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
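
The recurrence that the warm-up, main-loop and cool-down phases of __mpn_lshift pipeline is simple: each result limb combines the current source limb shifted left by cnt with the bits that fall out of the next lower limb, and the function returns the bits shifted out of the most significant limb. __mpn_rshift below is the mirror image, walking upward instead of downward. A C sketch, assuming 64-bit limbs and 0 < cnt < 64; the names are illustrative.

typedef unsigned long mp_limb_t;

mp_limb_t
lshift_sketch (mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;           /* the shift complement kept in $20 */
  mp_limb_t high = up[n - 1];
  mp_limb_t retval = high >> tnc;    /* bits shifted out: function result */
  for (long i = n - 1; i > 0; i--)
    {
      mp_limb_t low = up[i - 1];
      rp[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  rp[0] = high << cnt;
  return retval;
}
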
diff --git a/sysdeps/sw_64/sw6a/rshift.S b/sysdeps/sw_64/sw6a/rshift.S
new file mode 100644
index 00000000..416c3903
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift -- Shift a limb vector right.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
diff --git a/sysdeps/sw_64/sw6a/sub_n.S b/sysdeps/sw_64/sw6a/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw6a/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
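
Borrow propagation in __mpn_sub_n mirrors the addition case: the incoming borrow is first folded into the s2 limb (the "cy add"), the widened value is then subtracted from the s1 limb, and the two possible borrows are OR-ed together, since at most one of them can be set. A minimal C sketch under the same 64-bit limb assumption; names are illustrative.

typedef unsigned long mp_limb_t;

mp_limb_t
sub_n_sketch (mp_limb_t *rp, const mp_limb_t *s1p,
              const mp_limb_t *s2p, long n)
{
  mp_limb_t cy = 0;                 /* borrow */
  for (long i = 0; i < n; i++)
    {
      mp_limb_t t = s2p[i] + cy;    /* "cy add" */
      mp_limb_t c1 = t < cy;        /* wrapped: a borrow is certain */
      mp_limb_t r = s1p[i] - t;     /* "main sub" */
      mp_limb_t c2 = s1p[i] < r;    /* cmpult s1, diff: borrow from the sub */
      rp[i] = r;
      cy = c1 | c2;
    }
  return cy;
}
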
diff --git a/sysdeps/sw_64/sw6b/add_n.S b/sysdeps/sw_64/sw6b/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
diff --git a/sysdeps/sw_64/sw6b/addmul_1.S b/sysdeps/sw_64/sw6b/addmul_1.S
new file mode 100644
index 00000000..a288f040
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with pipeline expert
+ # . Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores),
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. Say that load gets stalled because it
+ # collides with a fill from the b_cache. On the next cycle, this load
+ # gets priority. It first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd
+ # like not to have a ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
diff --git a/sysdeps/sw_64/sw6b/lshift.S b/sysdeps/sw_64/sw6b/lshift.S
new file mode 100644
index 00000000..cc00593c
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift -- Shift a limb vector left.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
diff --git a/sysdeps/sw_64/sw6b/memcpy.S b/sysdeps/sw_64/sw6b/memcpy.S
new file mode 100644
index 00000000..938ebdfc
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/memcpy.S
@@ -0,0 +1,416 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ sw6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/*
+ * Much of the information about 21264 scheduling/coding comes from:
+ * Compiler Writer's Guide for the Sw_64 21264
+ * abbreviated as 'CWG' in other comments here
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ * E - either cluster
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ *
+ * Temp usage notes:
+ * $0 - destination address
+ * $1,$2, - scratch
+ */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+
+ .type $jumppointh,@object
+$jumppointh:
+ .gprel32 $both_0mod8
+ .gprel32 J$H01
+ .gprel32 J$H02
+ .gprel32 J$H03
+ .gprel32 J$H04
+ .gprel32 J$H05
+ .gprel32 J$H06
+ .gprel32 J$H07
+
+ENTRY(memcpy)
+ .prologue 1
+ ldgp $29, 0($27)
+ mov $16, $0 # E : copy dest to return
+ ble $18, $nomoredata # U : done with the copy?
+ cmplt $18, 8, $1
+ bne $1, $less_8
+ xor $16, $17, $1 # E : are source and dest alignments the same?
+ and $1, 7, $1 # E : are they the same mod 8?
+
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
+ /* source and dest are same mod 8 address */
+ and $16, 7, $1 # E : Are both 0mod8?
+ beq $1, $both_0mod8 # U : Yes
+ nop # E :
+
+ /*
+ * source and dest are same misalignment. move a byte at a time
+ * until a 0mod8 alignment for both is reached.
+ * At least one byte more to move
+ */
+
+ ldi $2, 8
+ subl $2, $1, $1
+
+$head_align:
+ addl $16, $1, $16
+ addl $17, $1, $17
+ subl $18, $1, $18
+ ldih $2, $jumppointh($29) !gprelhigh
+ s4addl $1, $2, $2
+ ldw $2, $jumppointh($2) !gprellow
+ addl $2, $29, $2
+ jmp ($2)
+
+$both_0mod8:
+ cmple $18, 127, $1 # E : Can we unroll the loop?
+ bne $1, $no_unroll # U :
+ and $16, 63, $1 # E : get mod64 alignment
+ beq $1, $do_unroll # U : no single quads to fiddle
+
+$single_head_quad:
+ ldl $1, 0($17) # L : get 8 bytes
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stl $1, 0($16) # L : store
+ addl $16, 8, $16 # E : dest += 8
+ and $16, 63, $1 # E : get mod64 alignment
+ bne $1, $single_head_quad # U : still not fully aligned
+
+$do_unroll:
+ ldih $1, 8($31) # bigger than 512K
+ cmple $18, $1, $1
+ beq $1, $unroll_body_512
+ nop
+ nop
+ cmple $18, 63, $1 # E : Can we go through the unrolled loop?
+ bne $1, $tail_quads # U : Nope
+ nop # E :
+
+$unroll_body:
+ ldl $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldl $4, 8($17) # L : bytes 8..15
+ ldl $5, 16($17) # L : bytes 16..23
+ nop # E :
+ nop # E :
+
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addl $17, 32, $17 # E : src += 32 bytes
+ stl $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stl $4, 8($16) # L : bytes 8..15
+ stl $5, 16($16) # L : bytes 16..23
+ subl $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stl $3, 24($16) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldl $6, 0($17) # L : bytes 0..7
+ ldl $4, 8($17) # L : bytes 8..15
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+ nop # E :
+
+ ldl $5, 16($17) # L : bytes 16..23
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32
+ subl $18, 64, $18 # E : count -= 64
+
+ addl $17, 32, $17 # E : src += 32
+ stl $6, -32($16) # L : bytes 0..7
+ stl $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stl $5, -16($16) # L : bytes 16..23
+ stl $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body
+ nop
+ nop
+ nop
+ br $tail_quads
+
+$unroll_body_512:
+ fillcs 128*4($17)
+ e_fillcs 128*20($17)
+
+ fillcs 128*3($16) # added by ZJ 2022-06-20: stl_nc -> stl
+ e_fillcs 128*7($16)
+
+ ldl $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldl $4, 8($17) # L : bytes 8..15
+ ldl $5, 16($17) # L : bytes 16..23
+ nop # E :
+ nop # E :
+
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addl $17, 32, $17 # E : src += 32 bytes
+ stl $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stl $4, 8($16) # L : bytes 8..15
+ stl $5, 16($16) # L : bytes 16..23
+ subl $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stl $3, 24($16) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldl $6, 0($17) # L : bytes 0..7
+ ldl $4, 8($17) # L : bytes 8..15
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+ nop # E :
+
+ ldl $5, 16($17) # L : bytes 16..23
+ ldl $3, 24($17) # L : bytes 24..31
+ addl $16, 32, $16 # E : dest += 32
+ subl $18, 64, $18 # E : count -= 64
+
+ addl $17, 32, $17 # E : src += 32
+ stl $6, -32($16) # L : bytes 0..7
+ stl $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stl $5, -16($16) # L : bytes 16..23
+ stl $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body_512
+
+$tail_quads:
+$no_unroll:
+ .align 4
+ subl $18, 8, $18 # E : At least a quad left?
+ blt $18, $less_than_8 # U : Nope
+ nop # E :
+ nop # E :
+
+$move_a_quad:
+ ldl $1, 0($17) # L : fetch 8
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stl $1, 0($16) # L : store 8
+ addl $16, 8, $16 # E : dest += 8
+ bge $18, $move_a_quad # U :
+ nop # E :
+
+$less_than_8:
+ .align 4
+ addl $18, 8, $18 # E : add back for trailing bytes
+ ble $18, $nomoredata # U : All-done
+ nop # E :
+ nop # E :
+
+ /* Trailing bytes */
+$tail_bytes:
+ subl $18, 1, $18 # E : count--
+ ldbu $1, 0($17) # L : fetch a byte
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store a byte
+ addl $16, 1, $16 # E : dest++
+ bgt $18, $tail_bytes # U : more to be done?
+ nop # E :
+
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+$misaligned:
+ mov $0, $4 # E : dest temp
+ and $0, 7, $1 # E : dest alignment mod8
+ beq $1, $dest_0mod8 # U : life doesn't totally suck
+ nop
+
+$aligndest:
+ ble $18, $nomoredata # U :
+ ldbu $1, 0($17) # L : fetch a byte
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+
+ stb $1, 0($4) # L : store it
+ addl $4, 1, $4 # E : dest++
+ and $4, 7, $1 # E : dest 0mod8 yet?
+ bne $1, $aligndest # U : go until we are aligned.
+
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
+$dest_0mod8:
+ subl $18, 8, $18 # E : At least a quad left?
+ blt $18, $misalign_tail # U : Nope
+ ldl_u $3, 0($17) # L : seed (rotating load) of 8 bytes
+ ldih $1, 8($31)
+ subl $1, 8, $1
+ cmple $18, $1, $1
+ beq $1, $mis_quad_big # bigger than 512K
+
+$mis_quad:
+ ldl_u $16, 8($17) # L : Fetch next 8
+ ext3b $3, $17, $3 # U : masking
+ ext7b $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ stl $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addl $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad # U : More quads to move
+ nop
+ nop
+ nop
+ br $misalign_tail
+
+$mis_quad_big:
+ fillcs 128*4($17)
+ e_fillcs 128*20($17)
+ ldl_u $16, 8($17) # L : Fetch next 8
+ ext3b $3, $17, $3 # U : masking
+ ext7b $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+ fillcs 128*9($17) # added by ZJ 2022-06-20: stl_nc -> stl
+ e_fillcs 128*15($17)
+
+ subl $18, 8, $18 # E : count -= 8
+ addl $17, 8, $17 # E : src += 8
+ stl $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addl $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad_big # U : More quads to move
+ nop
+ nop
+
+$misalign_tail:
+ addl $18, 8, $18 # E : account for tail stuff
+ ble $18, $nomoredata # U :
+ nop
+ nop
+
+$misalign_byte:
+ ldbu $1, 0($17) # L : fetch 1
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($4) # L : store
+ addl $4, 1, $4 # E : dest++
+ bgt $18, $misalign_byte # U : more to go?
+ nop
+ br $nomoredata
+
+$less_8:
+ ldbu $1, 0($17) # L : fetch 1
+ subl $18, 1, $18 # E : count--
+ addl $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store
+ addl $16, 1, $16 # E : dest++
+ bgt $18, $less_8 # U : more to go?
+ nop
+
+$nomoredata:
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+J$H01:
+ ldbu $1,-1($17)
+ stb $1,-1($16)
+ br $both_0mod8
+
+J$H02:
+ ldh $1,-2($17)
+ sth $1,-2($16)
+ br $both_0mod8
+
+J$H03:
+ ldh $1,-2($17)
+ ldbu $2,-3($17)
+ sth $1,-2($16)
+ stb $2,-3($16)
+ br $both_0mod8
+
+J$H04:
+ ldw $1,-4($17)
+ stw $1,-4($16)
+ br $both_0mod8
+
+J$H05:
+ ldw $1,-4($17)
+ ldbu $2,-5($17)
+ stw $1,-4($16)
+ stb $2,-5($16)
+ br $both_0mod8
+
+J$H06:
+ ldw $1,-4($17)
+ ldh $2,-6($17)
+ stw $1,-4($16)
+ sth $2,-6($16)
+ br $both_0mod8
+
+J$H07:
+ ldw $1,-4($17)
+ ldh $2,-6($17)
+ ldbu $3,-7($17)
+ stw $1,-4($16)
+ sth $2,-6($16)
+ stb $3,-7($16)
+ br $both_0mod8
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
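
The control flow of this memcpy is: copies shorter than 8 bytes go byte-by-byte ($less_8); when source and destination share the same alignment mod 8, the head bytes up to the next 8-byte boundary are copied through the small J$H01-J$H07 jump table, aligned quadwords are then moved in a 32-byte unrolled loop (switching to the fillcs/e_fillcs prefetching variant once more than 512K remain), and the trailing bytes finish up; when the alignments differ, only the destination is aligned and source quadwords are merged with ext3b/ext7b. A rough C sketch of the same-alignment path follows; the misaligned merging, the unrolling and the prefetching are deliberately omitted, and the names are illustrative.

#include <stddef.h>
#include <stdint.h>

void *
memcpy_sketch (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  if (n >= 8 && (((uintptr_t) d ^ (uintptr_t) s) & 7) == 0)
    {
      while (((uintptr_t) d & 7) != 0)          /* head bytes, at most 7 */
        { *d++ = *s++; n--; }
      for (; n >= 8; n -= 8, d += 8, s += 8)    /* aligned quadwords */
        *(uint64_t *) d = *(const uint64_t *) s;
    }
  while (n > 0)                                 /* tail, or tiny/misaligned copy */
    { *d++ = *s++; n--; }
  return dst;
}
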
diff --git a/sysdeps/sw_64/sw6b/memset.S b/sysdeps/sw_64/sw6b/memset.S
new file mode 100644
index 00000000..0085ac70
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/memset.S
@@ -0,0 +1,312 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+ .arch sw6b
+ .set noat
+ .set noreorder
+
+ENTRY(memset)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ /*
+ * Serious stalling happens. The only way to mitigate this is to
+ * undertake a major re-write to interleave the constant materialization
+ * with other parts of the fall-through code. This is important, even
+ * though it makes maintenance tougher.
+ * Do this later.
+ */
+ and $17, 255, $1 # E : 00000000000000ch
+ ins0b $17, 1, $2 # U : 000000000000ch00
+ mov $16, $0 # E : return value
+ mov $17, $8 # E : Save the ch
+ ble $18, $end # U : zero length requested?
+
+ addl $18, $16, $6 # E : max address to write to
+ or $1, $2, $17 # E : 000000000000chch
+ ins0b $1, 2, $3 # U : 0000000000ch0000
+ ins0b $1, 3, $4 # U : 00000000ch000000
+
+ or $3, $4, $3 # E : 00000000chch0000
+ ins1b $17, 4, $5 # U : 0000chch00000000
+ xor $16, $6, $1 # E : will complete write be within one quadword?
+ ins1b $17, 6, $2 # U : chch000000000000
+
+ or $17, $3, $17 # E : 00000000chchchch
+ or $2, $5, $2 # E : chchchch00000000
+ bic $1, 7, $1 # E : fit within a single quadword?
+ and $16, 7, $3 # E : Target addr misalignment
+
+ or $17, $2, $17 # E : chchchchchchchch
+ beq $1, $within_quad # U :
+ nop # E :
+ beq $3, $aligned # U : target is 0mod8
+
+ /*
+ * Target address is misaligned, and won't fit within a quadword.
+ */
+
+#ifdef pixman_error
+ /* If the addr is unaligned in a multi-threaded program, this quadword
+ read-modify-write would not be thread-safe, so use stb to store the
+ unaligned bytes instead. */
+ ldl_u $4, 0($16) # L : Fetch first partial
+ mov $16, $5 # E : Save the address
+ ins3b $17, $16, $2 # U : Insert new bytes
+ subl $3, 8, $3 # E : Invert (for addressing uses)
+
+ addl $18, $3, $18 # E : $18 is new count ($3 is negative)
+ mask3b $4, $16, $4 # U : clear relevant parts of the quad
+ subl $16, $3, $16 # E : $16 is new aligned destination
+ or $2, $4, $1 # E : Final bytes
+
+ nop
+ stl_u $1,0($5) # L : Store result
+ nop
+ nop
+#else
+$misaligned:
+ stb $8, 0($16)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $16, 1, $16
+ and $16, 7, $3 # E : Target addr misalignment
+ bne $3, $misaligned
+#endif
+
+ .align 4
+$aligned:
+ /*
+ * We are now guaranteed to be quad aligned, with at least
+ * one partial quad to write.
+ */
+
+ sra $18, 3, $3 # U : Number of remaining quads to write
+ and $18, 7, $18 # E : Number of trailing bytes to write
+ mov $16, $5 # E : Save dest address
+ beq $3, $no_quad # U : tail stuff only
+
+ /*
+ * It's worth the effort to unroll this and use wh64 if possible.
+ * At this point, entry values are:
+ * $16 Current destination address
+ * $5 A copy of $16
+ * $6 The max quadword address to write to
+ * $18 Number trailer bytes
+ * $3 Number quads to write
+ */
+# and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
+ and $16, 0x1f, $2 # E : Forward work (only useful for unrolled loop)
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
+ blt $4, $loop # U :
+
+ /*
+ * We know we've got at least 16 quads, minimum of one trip
+ * through unrolled loop. Do a quad at a time to get us 0mod64
+ * aligned.
+ */
+
+ nop # E :
+ nop # E :
+ nop # E :
+# beq $1, $bigalign # U :
+ beq $2, $bigalign # U :
+$alignmod32:
+ stl $17, 0($5) # L :
+ subl $3, 1, $3 # E : For consistency later
+ addl $1, 8, $1 # E : Increment towards zero for alignment
+# addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
+
+ nop
+ nop
+ addl $5, 8, $5 # E : Inc address
+ blt $1, $alignmod32 # U :
+
+
+$bigalign:
+ ldih $1, 8($31) # larger than 512KB
+ cmple $18, $1, $1
+ beq $1, $do_wh64_512
+
+ /*
+ * $3 - number quads left to go
+ * $5 - target address (aligned 0mod64)
+ * $17 - mask of stuff to store
+ * Scratch registers available: $7, $2, $4, $1
+ * We know that we'll be taking a minimum of one trip through.
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
+ * Assumes the wh64 needs to cover 2 trips through the loop in the
+ * future. The wh64 is issued for the starting destination address of
+ * trip +2 through the loop, and if there are fewer than two trips left,
+ * the target address will be for the current trip. */
+
+$do_wh64:
+# wh64 ($4) # L1 : memory subsystem write hint
+ subl $3, 24, $2 # E : For determining future wh64 addresses
+ stl $17, 0($5) # L :
+ nop # E :
+
+# addl $5, 128, $4 # E : speculative target of next wh64
+ stl $17, 8($5) # L :
+ stl $17, 16($5) # L :
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+ stl $17, 24($5) # L :
+ stl $17, 32($5) # L :
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+ stl $17, 40($5) # L :
+ stl $17, 48($5) # L :
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+ stl $17, 56($5) # L :
+ addl $5, 64, $5 # E :
+ subl $3, 8, $3 # E :
+ bge $2, $do_wh64 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ nop
+ nop
+ nop
+ br $loop # U : Might have finished already
+
+$do_wh64_512:
+# wh64 ($4) # L1 : memory subsystem write hint
+ subl $3, 24, $2 # E : For determining future wh64 addresses
+
+ fillcs 128*1($5)
+ e_fillcs 128*5($5)
+
+# stl_nc $17, 0($5) # L :
+ stl $17, 0($5) # L :
+ nop # E :
+
+# addl $5, 128, $4 # E : speculative target of next wh64
+# stl_nc $17, 8($5) # L :
+ stl $17, 8($5) # L :
+# stl_nc $17, 16($5) # L :
+ stl $17, 16($5) # L :
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+# stl_nc $17, 24($5) # L :
+ stl $17, 24($5) # L :
+# stl_nc $17, 32($5) # L :
+ stl $17, 32($5) # L :
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+# stl_nc $17, 40($5) # L :
+ stl $17, 40($5) # L :
+# stl_nc $17, 48($5) # L :
+ stl $17, 48($5) # L :
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+# stl_nc $17, 56($5) # L :
+ stl $17, 56($5) # L :
+ addl $5, 64, $5 # E :
+ subl $3, 8, $3 # E :
+ bge $2, $do_wh64_512 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ .align 4
+ /*
+ * Simple loop for trailing quadwords, or for small amounts
+ * of data (where we can't use an unrolled loop and wh64)
+ */
+$loop:
+ stl $17, 0($5) # L :
+ subl $3, 1, $3 # E : Decrement number quads left
+ addl $5, 8, $5 # E : Inc address
+ bne $3, $loop # U : more?
+
+$no_quad:
+ /*
+ * Write 0..7 trailing bytes.
+ */
+ nop # E :
+ beq $18, $end # U : All done?
+
+#ifndef pixman_error
+/* If the addr is unaligned in a multi-threaded program, a quadword
+ read-modify-write would not be thread-safe, so use stb to store the
+ trailing bytes. */
+$trailing:
+ stb $17, 0($5)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $5, 1, $5
+ br $trailing
+#else
+ ldl $7, 0($5) # L :
+ mask7b $7, $6, $2 # U : Mask final quad
+
+ ins7b $17, $6, $4 # U : New bits
+ or $2, $4, $1 # E : Put it all together
+ stl $1, 0($5) # L : And back to memory
+ ret $31,($26),1 # L0 :
+#endif
+
+$within_quad:
+#ifdef PIXMAN_ERROR
+ /* If the addr is unaligned in a multi-threaded program, a quadword
+ read-modify-write would not be thread-safe, so use stb to store the
+ bytes instead. */
+ ldl_u $1, 0($16) # L :
+ ins3b $17, $16, $2 # U : New bits
+ mask3b $1, $16, $4 # U : Clear old
+ or $2, $4, $2 # E : New result
+
+ mask3b $2, $6, $4 # U :
+ mask7b $1, $6, $2 # U :
+ or $2, $4, $1 # E :
+ stl_u $1, 0($16) # L :
+#else
+ stb $8, 0($16)
+ subl $18, 1, $18
+ beq $18, $end
+ addl $16, 1, $16
+ br $within_quad
+#endif
+
+$end:
+ nop
+ nop
+ nop
+ ret $31,($26),1 # L0 :
+
+ END(memset)
+libc_hidden_builtin_def (memset)
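
The ins0b/ins1b/or sequence at the top of memset spreads the fill byte across a full 64-bit quadword (00000000000000ch -> chchchchchchchch) so that the main loops can store eight bytes at a time, leaving only the 0..7 leftover bytes for stb. A rough C equivalent of that byte replication follows; replicate_byte is an invented name used purely for illustration.

    #include <stdint.h>

    /* Sketch: build the chchchchchchchch pattern that the ins0b/ins1b/or
       sequence in memset materializes.  */
    static uint64_t replicate_byte (unsigned char ch)
    {
      uint64_t v = ch;   /* 00000000000000ch */
      v |= v << 8;       /* 000000000000chch */
      v |= v << 16;      /* 00000000chchchch */
      v |= v << 32;      /* chchchchchchchch */
      return v;
    }
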
diff --git a/sysdeps/sw_64/sw6b/rshift.S b/sysdeps/sw_64/sw6b/rshift.S
new file mode 100644
index 00000000..ec2a78b0
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift -- Shift a limb vector right and store the result.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
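
Behind the warm-up, main loop, and cool-down phases, each output limb of __mpn_rshift is simply the current source limb shifted right, merged with the low bits of the next limb shifted left; the function result is the bits shifted out of the least significant limb. A plain C reference of that limb-level operation, assuming 64-bit limbs and 1 <= cnt <= 63 (ref_rshift is an illustrative name, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* Sketch: what __mpn_rshift computes, without the software pipelining.  */
    static mp_limb_t ref_rshift (mp_limb_t *res, const mp_limb_t *src,
                                 size_t size, unsigned cnt)
    {
      mp_limb_t retval = src[0] << (64 - cnt);   /* bits shifted out, returned in $0 */
      for (size_t i = 0; i < size - 1; i++)
        res[i] = (src[i] >> cnt) | (src[i + 1] << (64 - cnt));
      res[size - 1] = src[size - 1] >> cnt;
      return retval;
    }
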
diff --git a/sysdeps/sw_64/sw6b/stxcpy.S b/sysdeps/sw_64/sw6b/stxcpy.S
new file mode 100644
index 00000000..cf07eb8e
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/stxcpy.S
@@ -0,0 +1,314 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy a null-terminated string from SRC to DST.
+
+ This is an internal routine used by strcpy, stpcpy, and strcat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+
+ On output:
+ t8 = bitmask (with one bit set) indicating the last byte written
+ a0 = unaligned address of the last *word* written
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxcpy, @function
+ .globl __stxcpy
+ .usepv __stxcpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxcpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ ldi t2, -1 # E : build a mask against false zero
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
+ mask7b t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mask3b t0, a1, t0 # U : assemble the first output word
+ cmpgeb zero, t2, t10 # E : bits set iff null found
+ or t0, t3, t1 # E : (stall)
+ bne t10, $a_eos # U : (stall)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == a source word not containing a null. */
+ /* Nops here to separate store quads from load quads */
+
+$a_loop:
+ stl_u t1, 0(a0) # L :
+ addl a0, 8, a0 # E :
+ nop
+ nop
+
+ ldl_u t1, 0(a1) # L : Latency=3
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t1, t10 # E : (3 cycle stall)
+ beq t10, $a_loop # U : (stall for t10)
+
+ /* Take care of the final (partial) word store.
+ On entry to this basic block we have:
+ t1 == the source word containing the null
+ t10 == the cmpgeb mask that found it. */
+$a_eos:
+ negl t10, t6 # E : find low bit set
+ and t10, t6, t8 # E : (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldl_u t0, 0(a0) # L : Latency=3
+ subl t8, 1, t6 # E :
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
+ or t8, t6, t10 # E : (stall)
+
+ zap t0, t10, t0 # E : clear dst bytes <= null
+ or t0, t1, t1 # E : (stall)
+ nop
+ nop
+
+1: stl_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxcpy:
+ /* Are source and destination co-aligned? */
+ xor a0, a1, t0 # E :
+ unop # E :
+ and t0, 7, t0 # E : (stall)
+ bne t0, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldl_u t1, 0(a1) # L : load first src word
+ and a0, 7, t0 # E : take care not to load a word ...
+ addl a1, 8, a1 # E :
+ beq t0, stxcpy_aligned # U : ... if we won't need it (stall)
+
+ ldl_u t0, 0(a0) # L :
+ br stxcpy_aligned # L0 : Latency=3
+ nop
+ nop
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, for masking back in, if needed else 0
+ t1 == the low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldl_u t2, 8(a1) # L :
+ addl a1, 8, a1 # E :
+ ext3b t1, a1, t1 # U : (stall on a1)
+ ext7b t2, a1, t4 # U : (stall on a1)
+
+ mask3b t0, a0, t0 # U :
+ or t1, t4, t1 # E :
+ mask7b t1, a0, t1 # U : (stall on t1)
+ or t0, t1, t1 # E : (stall on t1)
+
+ or t1, t6, t6 # E :
+ cmpgeb zero, t6, t10 # E : (stall)
+ ldi t6, -1 # E : for masking just below
+ bne t10, $u_final # U : (stall)
+
+ mask3b t6, a1, t6 # U : mask out the bits we have
+ or t6, t2, t2 # E : already extracted before (stall)
+ cmpgeb zero, t2, t10 # E : testing eos (stall)
+ bne t10, $u_late_head_exit # U : (stall)
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+
+ stl_u t1, 0(a0) # L : store first output word
+ addl a0, 8, a0 # E :
+ ext3b t2, a1, t0 # U : position ho-bits of lo word
+ ldl_u t2, 8(a1) # U : read next high-order source word
+
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t2, t10 # E : (stall for t2)
+ nop # E :
+ bne t10, $u_eos # U : (stall)
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 3
+$u_loop:
+ ext7b t2, a1, t1 # U : extract high bits for current word
+ addl a1, 8, a1 # E : (stall)
+ ext3b t2, a1, t3 # U : extract low bits for next time (stall)
+ addl a0, 8, a0 # E :
+
+ or t0, t1, t1 # E : current dst word now complete
+ ldl_u t2, 0(a1) # L : Latency=3 load high word for next time
+ stl_u t1, -8(a0) # L : save the current word (stall)
+ mov t3, t0 # E :
+
+ cmpgeb zero, t2, t10 # E : test new word for eos
+ beq t10, $u_loop # U : (stall)
+ nop
+ nop
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ ext7b t2, a1, t1 # U :
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
+ cmpgeb zero, t1, t10 # E : is the null in this first bit? (stall)
+ bne t10, $u_final # U : (stall)
+
+$u_late_head_exit:
+ stl_u t1, 0(a0) # L : the null was in the high-order bits
+ addl a0, 8, a0 # E :
+ ext3b t2, a1, t1 # U :
+ cmpgeb zero, t1, t10 # E : (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t1 == assembled source word
+ t10 == cmpgeb mask that found the null. */
+$u_final:
+ negl t10, t6 # E : isolate low bit set
+ and t6, t10, t8 # E : (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t0, 0(a0) # E :
+ subl t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
+
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
+ or t0, t1, t1 # E : (stall)
+ nop
+ nop
+
+1: stl_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldl_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U :
+ ldl_u t0, 0(a0) # L :
+ ldi t6, -1 # E :
+
+ mask3b t6, a0, t6 # U :
+ nop
+ nop
+ nop
+1:
+ subl a1, t4, a1 # E : sub dest misalignment from src addr
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+ cmplt t4, t5, t8 # E :
+ beq t8, $u_head # U :
+ ldi t2, -1 # E : mask out leading garbage in source
+
+ mask7b t2, t5, t2 # U :
+ ornot t1, t2, t3 # E : (stall)
+ cmpgeb zero, t3, t10 # E : is there a zero? (stall)
+ beq t10, $u_head # U : (stall)
+
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+
+ ldl_u t0, 0(a0) # L :
+ negl t10, t6 # E : build bitmask of bytes <= zero
+ and t6, t10, t8 # E : (stall)
+ and a1, 7, t5 # E :
+
+ subl t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ srl t8, t5, t8 # U : adjust final null return value
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
+
+ and t1, t2, t1 # E : to source validity mask
+ ext3b t2, a1, t2 # U :
+ ext3b t1, a1, t1 # U : (stall)
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
+
+ or t0, t1, t1 # e1 : and put it there
+ stl_u t1, 0(a0) # .. e0 : (stall)
+ ret (t9) # e1 :
+
+ cfi_endproc
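
The key primitive in the copy above is "cmpgeb zero, word, mask", which produces a per-byte bitmask whose set bits mark the zero bytes of an 8-byte word, letting the loop test a whole quadword for the terminating NUL at once. A small, deliberately naive C stand-in for that primitive is shown below; zero_byte_mask is a made-up name, and a production version would normally use a branch-free bit trick instead of the loop.

    #include <stdint.h>

    /* Sketch: bit i of the result is set iff byte i of w is zero,
       mirroring "cmpgeb zero, w, mask" in the code above.  */
    static unsigned zero_byte_mask (uint64_t w)
    {
      unsigned mask = 0;
      for (int i = 0; i < 8; i++)
        if (((w >> (8 * i)) & 0xff) == 0)
          mask |= 1u << i;
      return mask;
    }
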
diff --git a/sysdeps/sw_64/sw6b/stxncpy.S b/sysdeps/sw_64/sw6b/stxncpy.S
new file mode 100644
index 00000000..c47029ea
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/stxncpy.S
@@ -0,0 +1,392 @@
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy no more than COUNT bytes of the null-terminated string from
+ SRC to DST.
+
+ This is an internal routine used by strncpy, stpncpy, and strncat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+ a2 = COUNT
+
+ Furthermore, COUNT may not be zero.
+
+ On output:
+ t0 = last word written
+ t8 = bitmask (with one bit set) indicating the last byte written
+ t10 = bitmask (with one bit set) indicating the byte position of
+ the end of the range specified by COUNT
+ a0 = unaligned address of the last *word* written
+ a2 = the number of full words left in COUNT
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxncpy, @function
+ .globl __stxncpy
+ .usepv __stxncpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxncpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ ldi t2, -1 # E : build a mask against false zero
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
+ mask7b t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mask3b t0, a1, t0 # U : assemble the first output word
+ cmpgeb zero, t2, t7 # E : bits set iff null found
+ or t0, t3, t0 # E : (stall)
+ beq a2, $a_eoc # U :
+
+ bne t7, $a_eos # U :
+ nop
+ nop
+ nop
+
+ /* On entry to this basic block:
+ t0 == a source word not containing a null. */
+
+ /*
+ * nops here to:
+ * separate store quads from load quads
+ * limit of 1 bcond/quad to permit training
+ */
+$a_loop:
+ stl_u t0, 0(a0) # L :
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ nop
+
+ ldl_u t0, 0(a1) # L :
+ addl a1, 8, a1 # E :
+ cmpgeb zero, t0, t7 # E :
+ beq a2, $a_eoc # U :
+
+ beq t7, $a_loop # U :
+ nop
+ nop
+ nop
+
+ /* Take care of the final (partial) word store. At this point
+ the end-of-count bit is set in t7 iff it applies.
+
+ On entry to this basic block we have:
+ t0 == the source word containing the null
+ t7 == the cmpgeb mask that found it. */
+$a_eos:
+ negl t7, t8 # E : find low bit set
+ and t7, t8, t8 # E : (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldl_u t1, 0(a0) # L :
+ subl t8, 1, t6 # E :
+ or t8, t6, t7 # E : (stall)
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
+
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
+ or t0, t1, t0 # e1 : (stall)
+ nop
+ nop
+
+1: stl_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Add the end-of-count bit to the eos detection bitmask. */
+$a_eoc:
+ or t10, t7, t7 # E :
+ br $a_eos # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxncpy:
+ /* Are source and destination co-aligned? */
+ ldi t2, -1 # E :
+ xor a0, a1, t1 # E :
+ and a0, 7, t0 # E : find dest misalignment
+ nop # E :
+
+ srl t2, 1, t2 # U :
+ and t1, 7, t1 # E :
+ sellt a2, t2, a2, a2 # E : bound count to LONG_MAX (stall)
+ nop # E :
+
+ addl a2, t0, a2 # E : bias count by dest misalignment
+ subl a2, 1, a2 # E : (stall)
+ and a2, 7, t2 # E : (stall)
+ ldi t10, 1 # E :
+
+ srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
+ nop # E :
+ bne t1, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldl_u t1, 0(a1) # L : load first src word
+ addl a1, 8, a1 # E :
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
+ ldl_u t0, 0(a0) # L :
+
+ br stxncpy_aligned # U :
+ nop
+ nop
+ nop
+
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, unmasked
+ t1 == the shifted low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldl_u t2, 8(a1) # L : Latency=3 load second src word
+ addl a1, 8, a1 # E :
+ mask3b t0, a0, t0 # U : mask trailing garbage in dst
+ ext7b t2, a1, t4 # U : (3 cycle stall on t2)
+
+ or t1, t4, t1 # E : first aligned src word complete (stall)
+ mask7b t1, a0, t1 # U : mask leading garbage in src (stall)
+ or t0, t1, t0 # E : first output word complete (stall)
+ or t0, t6, t6 # E : mask original data for zero test (stall)
+
+ cmpgeb zero, t6, t7 # E :
+ beq a2, $u_eocfin # U :
+ ldi t6, -1 # E :
+ nop
+
+ bne t7, $u_final # U :
+ mask3b t6, a1, t6 # U : mask out bits already seen
+ stl_u t0, 0(a0) # L : store first output word
+ or t6, t2, t2 # E :
+
+ cmpgeb zero, t2, t7 # E : find nulls in second partial
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ bne t7, $u_late_head_exit # U :
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+ ext3b t2, a1, t1 # U : position hi-bits of lo word
+ beq a2, $u_eoc # U :
+ ldl_u t2, 8(a1) # L : read next high-order source word
+ addl a1, 8, a1 # E :
+
+ ext7b t2, a1, t0 # U : position lo-bits of hi word (stall)
+ cmpgeb zero, t2, t7 # E :
+ nop
+ bne t7, $u_eos # U :
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 4
+$u_loop:
+ or t0, t1, t0 # E : current dst word now complete
+ subl a2, 1, a2 # E : decrement word count
+ ext3b t2, a1, t1 # U : extract high bits for next time
+ addl a0, 8, a0 # E :
+
+ stl_u t0, -8(a0) # L : save the current word
+ beq a2, $u_eoc # U :
+ ldl_u t2, 8(a1) # L : Latency=3 load high word for next time
+ addl a1, 8, a1 # E :
+
+ ext7b t2, a1, t0 # U : extract low bits (2 cycle stall)
+ cmpgeb zero, t2, t7 # E : test new word for eos
+ nop
+ beq t7, $u_loop # U :
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ or t0, t1, t0 # E : first (partial) source word complete
+ nop
+ cmpgeb zero, t0, t7 # E : is the null in this first bit? (stall)
+ bne t7, $u_final # U : (stall)
+
+ stl_u t0, 0(a0) # L : the null was in the high-order bits
+ addl a0, 8, a0 # E :
+ subl a2, 1, a2 # E :
+ nop
+
+$u_late_head_exit:
+ ext3b t2, a1, t0 # U :
+ cmpgeb zero, t0, t7 # E :
+ or t7, t10, t6 # E : (stall)
+ seleq a2, t6, t7, t7 # E : Latency=2, extra map slot (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t0 == assembled source word
+ t7 == cmpgeb mask that found the null. */
+$u_final:
+ negl t7, t6 # E : isolate low bit set
+ and t6, t7, t8 # E : (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t1, 0(a0) # L :
+ subl t8, 1, t6 # E :
+ or t6, t8, t7 # E : (stall)
+ zapnot t0, t7, t0 # U : kill source bytes > null
+
+ zap t1, t7, t1 # U : kill dest bytes <= null
+ or t0, t1, t0 # E : (stall)
+ nop
+ nop
+
+1: stl_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+
+ /* Got to end-of-count before end of string.
+ On entry to this basic block:
+ t1 == the shifted high-order bits from the previous source word */
+$u_eoc:
+ and a1, 7, t6 # E :
+ sll t10, t6, t6 # U : (stall)
+ and t6, 0xff, t6 # E : (stall)
+ bne t6, 1f # U : (stall)
+
+ ldl_u t2, 8(a1) # L : load final src word
+ nop
+ ext7b t2, a1, t0 # U : extract low bits for last word (stall)
+ or t1, t0, t1 # E : (stall)
+
+1: cmpgeb zero, t1, t7 # E :
+ mov t1, t0
+
+$u_eocfin: # end-of-count, final word
+ or t10, t7, t7 # E :
+ br $u_final # L0 : Latency=3
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldl_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U :
+ ldl_u t0, 0(a0) # L :
+ ldi t6, -1 # E :
+
+ mask3b t6, a0, t6 # U :
+ nop
+ nop
+1: subl a1, t4, a1 # E : sub dest misalignment from src addr
+
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+
+ cmplt t4, t5, t8 # E :
+ ext3b t1, a1, t1 # U : shift src into place
+ ldi t2, -1 # E : for creating masks later
+ beq t8, $u_head # U : (stall)
+
+ mask7b t2, t5, t2 # U : begin src byte validity mask
+ cmpgeb zero, t1, t7 # E : is there a zero?
+ ext3b t2, a1, t2 # U :
+ or t7, t10, t5 # E : test for end-of-count too
+
+ cmpgeb zero, t2, t3 # E :
+ seleq a2, t5, t7, t7 # E : Latency=2, extra map slot
+ nop # E : keep with seleq
+ andnot t7, t3, t7 # E : (stall)
+
+ beq t7, $u_head # U :
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+ ldl_u t0, 0(a0) # L :
+ negl t7, t6 # E : build bitmask of bytes <= zero
+ mask7b t1, t4, t1 # U :
+
+ and t6, t7, t8 # E :
+ subl t8, 1, t6 # E : (stall)
+ or t6, t8, t7 # E : (stall)
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
+
+ zapnot t1, t7, t1 # U : to source validity mask
+ andnot t0, t2, t0 # E : zero place for source to reside
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
+ stl_u t0, 0(a0) # L : (stall)
+
+ ret (t9) # L0 : Latency=3
+
+ cfi_endproc
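
Compared with __stxcpy, __stxncpy must also stop after COUNT bytes, so its prologue converts COUNT into a full-word loop counter (a2) plus a single-bit mask (t10) that marks the last permitted byte position; at the end of the count that bit is simply ORed into the NUL-detection mask ($a_eoc / $u_eocfin). The setup arithmetic is sketched in C below with invented names; it assumes COUNT is nonzero, as the header comment requires.

    #include <stdint.h>

    /* Sketch: the loop counter and last-byte bitmask that __stxncpy
       derives from COUNT and the destination misalignment.  */
    struct stxncpy_setup { uint64_t full_words; unsigned last_byte_bit; };

    static struct stxncpy_setup stxncpy_prologue (uint64_t count,
                                                  unsigned dst_misalign)
    {
      uint64_t biased = count + dst_misalign - 1;   /* bias count by dest misalignment */
      struct stxncpy_setup s;
      s.full_words = biased >> 3;                   /* loop counter, a2 */
      s.last_byte_bit = 1u << (biased & 7);         /* bitmask t10 */
      return s;
    }
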
diff --git a/sysdeps/sw_64/sw6b/sub_n.S b/sysdeps/sw_64/sw6b/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw6b/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
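
Every stage of the pipelined loops above does the same limb-level work: fold the incoming borrow into the s2 limb (the "cy add"), do the "main sub", and derive the outgoing borrow from two cmpult compares ORed together. That computation, written plainly in C with 64-bit limbs, might look like the sketch below (ref_sub_n is an illustrative name only).

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* Sketch: the per-limb borrow propagation of __mpn_sub_n.  */
    static mp_limb_t ref_sub_n (mp_limb_t *res, const mp_limb_t *s1,
                                const mp_limb_t *s2, size_t size)
    {
      mp_limb_t cy = 0;                  /* borrow, 0 or 1 */
      for (size_t i = 0; i < size; i++)
        {
          mp_limb_t t = s2[i] + cy;      /* "cy add" */
          mp_limb_t c1 = t < cy;         /* carry out of the cy add (cmpult $28,$25) */
          mp_limb_t d = s1[i] - t;       /* "main sub" */
          mp_limb_t c2 = d > s1[i];      /* borrow from the sub (cmpult $5,$21) */
          res[i] = d;
          cy = c1 | c2;                  /* combine the two carries */
        }
      return cy;                         /* returned in $0 */
    }
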
diff --git a/sysdeps/sw_64/sw8a/add_n.S b/sysdeps/sw_64/sw8a/add_n.S
new file mode 100644
index 00000000..86e9f9ae
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/add_n.S
@@ -0,0 +1,146 @@
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_add_n
+ .ent __mpn_add_n
+__mpn_add_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ addl $0,$4,$20 # 1st main add
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $20,$0,$25 # compute cy from last add
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ addl $5,$28,$21 # 2nd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $21,$28,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ addl $28,$6,$22 # 3rd main add
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ addl $4,$28,$20 # 1st main add
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $20,$28,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ addl $5,$28,$21 # 2nd main add
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ addl $28,$6,$22 # 3rd main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $22,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ addl $28,$7,$23 # 4th main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $23,$28,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ addl $4,$28,$20 # main add
+ ldl $4,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $20,$28,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ addl $4,$28,$20 # main add
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $20,$28,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_add_n
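
__mpn_add_n is the addition counterpart of __mpn_sub_n above: the carry out of each limb addition is recovered with an unsigned compare (cmpult), and the carries from the "cy add" and the "main add" are merged with or. A corresponding C sketch, again with an invented name and 64-bit limbs:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* Sketch: the per-limb carry propagation of __mpn_add_n.  */
    static mp_limb_t ref_add_n (mp_limb_t *res, const mp_limb_t *s1,
                                const mp_limb_t *s2, size_t size)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < size; i++)
        {
          mp_limb_t t = s2[i] + cy;      /* "cy add" */
          mp_limb_t c1 = t < cy;         /* cmpult $28,$25 */
          mp_limb_t sum = s1[i] + t;     /* "main add" */
          mp_limb_t c2 = sum < t;        /* cmpult $20,$28 */
          res[i] = sum;
          cy = c1 | c2;
        }
      return cy;
    }
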
diff --git a/sysdeps/sw_64/sw8a/addmul_1.S b/sysdeps/sw_64/sw8a/addmul_1.S
new file mode 100644
index 00000000..95487c26
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/addmul_1.S
@@ -0,0 +1,475 @@
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ #
+ # This code was written in close cooperation with a pipeline expert.
+ # Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores),
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldl in L1. Say that load gets stalled because it
+ # collides with a fill from the b_cache. On the next cycle, this load
+ # gets priority. It first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, flds, intent to modify. For the multiplier, you might
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd
+ # like not to have a ldl or stl precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1
+ beq $1, $Large
+
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $18, 1, $18 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $18, $Loop0
+$Lend0a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1
+$Lend0b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $0, $5, $0
+ ret $31, ($26), 1
+
+$Large:
+ ldi $30, -240($30)
+ stl $9, 8($30)
+ stl $10, 16($30)
+ stl $11, 24($30)
+ stl $12, 32($30)
+ stl $13, 40($30)
+ stl $14, 48($30)
+ stl $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0
+ beq $20, $Lunroll
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ subl $20, 1, $20 # size--
+ addl $5, $3, $3
+ cmpult $3, $5, $4
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subl $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldl $2, 0($17) # $2 = s1_limb
+ addl $17, 8, $17 # s1_ptr++
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ bne $20, $Loop1
+
+$Lend1a:
+ mull $2, $19, $3 # $3 = prod_low
+ ldl $5, 0($16) # $5 = *res_ptr
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $5, $0, $0 # combine carries
+ addl $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll
+$Lend1b:
+ addl $5, $3, $3
+ cmpult $3, $5, $5
+ stl $3, 0($16)
+ addl $16, 8, $16 # res_ptr++
+ addl $0, $5, $0
+
+$Lunroll:
+ ldi $17, -16($17) # L1 bookkeeping
+ ldi $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldl $2, 16($17) # L1
+ ldl $3, 24($17) # L1
+ ldi $18, -1($18) # L1 bookkeeping
+ ldl $6, 16($16) # L1
+ ldl $7, 24($16) # L1
+ ldl $0, 32($17) # L1
+ mull $19, $2, $13 # U1
+ ldl $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mull $19, $3, $15 # U1
+ ldi $17, 64($17) # L1 bookkeeping
+ ldl $4, 32($16) # L1
+ ldl $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldl $2, -16($17) # L1
+ mull $19, $0, $9 # U1
+ ldl $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ mull $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+ mull $19, $3, $15 # U1
+ addl $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, 16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $18, -1($18) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 32($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $17, 64($17) # L1 bookkeeping
+ addl $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ ldl $2, -16($17) # L1
+
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ ldl $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ ldi $16, 64($16) # L1 bookkeeping
+ addl $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ ldl $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ ldl $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ ldl $0, 0($17) # L1
+
+ mull $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ ldl $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addl $4, $9, $4 # L0 lo + acc
+ stl $22, -48($16) # L0
+ stl $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mull $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addl $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ ldl $4, 0($16) # L1
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ ldl $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ mull $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addl $6, $13, $6 # L0 lo + acc
+ stl $22, -32($16) # L0
+ stl $23, -24($16) # L1
+ mull $19, $1, $11 # U1
+ addl $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addl $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addl $14, $20, $14 # U0 hi mul + carry
+ addl $7, $15, $23 # L0 lo + acc
+ addl $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addl $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addl $8, $20, $8 # U0 hi mul + carry
+ addl $4, $9, $4 # U0 lo + acc
+ stl $22, -16($16) # L0
+ stl $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addl $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addl $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addl $10, $20, $10 # U0 hi mul + carry
+ addl $5, $11, $23 # L0 lo + acc
+ addl $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addl $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addl $12, $20, $12 # U0 hi mul + carry
+ stl $22, 0($16) # L0
+ stl $23, 8($16) # L1
+ addl $12, $21, $0 # U0 hi mul + carry
+
+ ldl $9, 8($30)
+ ldl $10, 16($30)
+ ldl $11, 24($30)
+ ldl $12, 32($30)
+ ldl $13, 40($30)
+ ldl $14, 48($30)
+ ldl $15, 56($30)
+ ldi $30, 240($30)
+ ret $31, ($26), 1
+
+ .end __mpn_addmul_1
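
As a reading aid, here is a minimal C sketch of the operation __mpn_addmul_1 performs: rp[] += up[] * v over n limbs, returning the final carry limb. The name ref_addmul_1, the mp_limb_t typedef, and the use of unsigned __int128 are illustrative assumptions and are not part of the patch; the sketch only mirrors the mull/umulh products and the cmpult-based carry detection visible in the pipelined loop above.

/* Sketch only: one limb per iteration of what the unrolled loop computes.  */
typedef unsigned long mp_limb_t;                /* 64-bit limb assumed */

mp_limb_t
ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v)
{
  mp_limb_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v;  /* mull + umulh */
      mp_limb_t lo = (mp_limb_t) p;
      mp_limb_t hi = (mp_limb_t) (p >> 64);
      mp_limb_t t = rp[i] + lo;                 /* lo + acc */
      hi += t < lo;                             /* cmpult => carry */
      mp_limb_t r = t + carry;                  /* add carry from previous limb */
      hi += r < t;                              /* cmpult => carry */
      rp[i] = r;
      carry = hi;                               /* carry always fits in one limb */
    }
  return carry;
}
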
diff --git a/sysdeps/sw_64/sw8a/lshift.S b/sysdeps/sw_64/sw8a/lshift.S
new file mode 100644
index 00000000..76f1fb0e
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/lshift.S
@@ -0,0 +1,172 @@
+ # Sw_64 __mpn_lshift -- Shift a vector of limbs left a given number of bits
+ # and store the result in a second limb vector.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_lshift
+ .ent __mpn_lshift
+__mpn_lshift:
+ .frame $30,0,$26,0
+
+ s8addl $18,$17,$17 # make r17 point at end of s1
+ ldl $4,-8($17) # load first limb
+ subl $31,$19,$20
+ s8addl $18,$16,$16 # make r16 point at end of RES
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ srl $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,-16($17)
+ subl $16,8,$16
+ sll $4,$19,$5
+ subl $17,8,$17
+ subl $28,1,$28
+ srl $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,0($16)
+ bne $28,.Loop0
+
+.L0: sll $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,-16($17)
+ subl $18,4,$18
+ ldl $2,-24($17)
+ ldl $3,-32($17)
+ ldl $4,-40($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ ldl $1,-48($17)
+ sll $2,$19,$22
+ ldl $2,-56($17)
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ ldl $3,-64($17)
+ sll $4,$19,$24
+ ldl $4,-72($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+
+ srl $1,$20,$7
+ subl $18,4,$18
+ sll $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ srl $2,$20,$8
+ ldl $1,-80($17)
+ sll $2,$19,$22
+ ldl $2,-88($17)
+
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+
+ srl $3,$20,$5
+ unop # ldl $31,-96($17)
+ sll $3,$19,$23
+ subl $16,32,$16
+
+ srl $4,$20,$6
+ ldl $3,-96($17)
+ sll $4,$19,$24
+ ldl $4,-104($17)
+
+ subl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ stl $5,-24($16)
+ or $7,$24,$7
+ stl $6,-32($16)
+ or $8,$21,$8
+ srl $3,$20,$5
+ sll $3,$19,$23
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 2/2
+ stl $7,-40($16)
+ or $5,$22,$5
+ stl $8,-48($16)
+ or $6,$23,$6
+ stl $5,-56($16)
+ stl $6,-64($16)
+ # cool down phase 2/3
+ stl $24,-72($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: srl $1,$20,$7
+ sll $1,$19,$21
+ srl $2,$20,$8
+ sll $2,$19,$22
+ srl $3,$20,$5
+ or $7,$24,$7
+ sll $3,$19,$23
+ or $8,$21,$8
+ srl $4,$20,$6
+ sll $4,$19,$24
+ # cool down phase 1/2
+ stl $7,-8($16)
+ or $5,$22,$5
+ stl $8,-16($16)
+ or $6,$23,$6
+ stl $5,-24($16)
+ stl $6,-32($16)
+ stl $24,-40($16)
+ ret $31,($26),1
+
+.Lend: stl $24,-8($16)
+ ret $31,($26),1
+ .end __mpn_lshift
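
The routine above shifts an n-limb vector left by cnt bits (0 < cnt < 64), returning the bits shifted out of the most significant limb. It walks the vector from the most significant limb downward, which is why both pointers are first advanced to the ends of the operands. A minimal C sketch of that contract follows; ref_lshift and mp_limb_t are illustrative names, not part of the patch.

/* Sketch only: semantics of __mpn_lshift for 0 < cnt < 64.  */
typedef unsigned long mp_limb_t;                /* 64-bit limb assumed */

mp_limb_t
ref_lshift (mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;                      /* subl $31,$19,$20 (mod 64) */
  mp_limb_t retval = up[n - 1] >> tnc;          /* bits shifted out => result */
  for (long i = n - 1; i > 0; i--)              /* high-to-low, like the asm */
    rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
  rp[0] = up[0] << cnt;
  return retval;
}
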
diff --git a/sysdeps/sw_64/sw8a/rshift.S b/sysdeps/sw_64/sw8a/rshift.S
new file mode 100644
index 00000000..ec2a78b0
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/rshift.S
@@ -0,0 +1,170 @@
+ # Sw_64 __mpn_rshift -- Shift a vector of limbs right a given number of bits
+ # and store the result in a second limb vector.
+
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr r16
+ # s1_ptr r17
+ # size r18
+ # cnt r19
+
+ # This code runs at 3.25 cycles/limb on the sw_64.
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_rshift
+ .ent __mpn_rshift
+__mpn_rshift:
+ .frame $30,0,$26,0
+
+ ldl $4,0($17) # load first limb
+ subl $31,$19,$20
+ subl $18,1,$18
+ and $18,4-1,$28 # number of limbs in first loop
+ sll $4,$20,$0 # compute function result
+
+ beq $28,.L0
+ subl $18,$28,$18
+
+ .align 3
+.Loop0: ldl $3,8($17)
+ addl $16,8,$16
+ srl $4,$19,$5
+ addl $17,8,$17
+ subl $28,1,$28
+ sll $3,$20,$6
+ or $3,$3,$4
+ or $5,$6,$8
+ stl $8,-8($16)
+ bne $28,.Loop0
+
+.L0: srl $4,$19,$24
+ beq $18,.Lend
+ # warm up phase 1
+ ldl $1,8($17)
+ subl $18,4,$18
+ ldl $2,16($17)
+ ldl $3,24($17)
+ ldl $4,32($17)
+ beq $18,.Lend1
+ # warm up phase 2
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ ldl $1,40($17)
+ srl $2,$19,$22
+ ldl $2,48($17)
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ ldl $3,56($17)
+ srl $4,$19,$24
+ ldl $4,64($17)
+ subl $18,4,$18
+ beq $18,.Lend2
+ .align 4
+ # main loop
+.Loop: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+
+ sll $1,$20,$7
+ subl $18,4,$18
+ srl $1,$19,$21
+ unop # ldl $31,-96($17)
+
+ sll $2,$20,$8
+ ldl $1,72($17)
+ srl $2,$19,$22
+ ldl $2,80($17)
+
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+
+ sll $3,$20,$5
+ unop # ldl $31,-96($17)
+ srl $3,$19,$23
+ addl $16,32,$16
+
+ sll $4,$20,$6
+ ldl $3,88($17)
+ srl $4,$19,$24
+ ldl $4,96($17)
+
+ addl $17,32,$17
+ bne $18,.Loop
+ # cool down phase 2/1
+.Lend2: stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ stl $5,16($16)
+ or $7,$24,$7
+ stl $6,24($16)
+ or $8,$21,$8
+ sll $3,$20,$5
+ srl $3,$19,$23
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 2/2
+ stl $7,32($16)
+ or $5,$22,$5
+ stl $8,40($16)
+ or $6,$23,$6
+ stl $5,48($16)
+ stl $6,56($16)
+ # cool down phase 2/3
+ stl $24,64($16)
+ ret $31,($26),1
+
+ # cool down phase 1/1
+.Lend1: sll $1,$20,$7
+ srl $1,$19,$21
+ sll $2,$20,$8
+ srl $2,$19,$22
+ sll $3,$20,$5
+ or $7,$24,$7
+ srl $3,$19,$23
+ or $8,$21,$8
+ sll $4,$20,$6
+ srl $4,$19,$24
+ # cool down phase 1/2
+ stl $7,0($16)
+ or $5,$22,$5
+ stl $8,8($16)
+ or $6,$23,$6
+ stl $5,16($16)
+ stl $6,24($16)
+ stl $24,32($16)
+ ret $31,($26),1
+
+.Lend: stl $24,0($16)
+ ret $31,($26),1
+ .end __mpn_rshift
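
The routine above is the mirror image of __mpn_lshift: it shifts an n-limb vector right by cnt bits (0 < cnt < 64), processes the limbs from least to most significant, and returns the bits shifted out of the low limb, left-aligned in the result. A minimal C sketch follows; ref_rshift and mp_limb_t are illustrative names, not part of the patch.

/* Sketch only: semantics of __mpn_rshift for 0 < cnt < 64.  */
typedef unsigned long mp_limb_t;                /* 64-bit limb assumed */

mp_limb_t
ref_rshift (mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;                      /* subl $31,$19,$20 (mod 64) */
  mp_limb_t retval = up[0] << tnc;              /* bits shifted out => result */
  for (long i = 0; i < n - 1; i++)              /* low-to-high, like the asm */
    rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
  rp[n - 1] = up[n - 1] >> cnt;
  return retval;
}
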
diff --git a/sysdeps/sw_64/sw8a/sub_n.S b/sysdeps/sw_64/sw8a/sub_n.S
new file mode 100644
index 00000000..95c257f7
--- /dev/null
+++ b/sysdeps/sw_64/sw8a/sub_n.S
@@ -0,0 +1,147 @@
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # s2_ptr $18
+ # size $19
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __mpn_sub_n
+ .ent __mpn_sub_n
+__mpn_sub_n:
+ .frame $30,0,$26,0
+
+ or $31,$31,$25 # clear cy
+ subl $19,4,$19 # decr loop cnt
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+ ldl $0,0($18)
+ ldl $1,8($18)
+ ldl $4,0($17)
+ ldl $5,8($17)
+ addl $17,32,$17 # update s1_ptr
+ ldl $2,16($18)
+ subl $4,$0,$20 # 1st main sub
+ ldl $3,24($18)
+ subl $19,4,$19 # decr loop cnt
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last sub
+ ldl $7,-8($17)
+ addl $1,$25,$28 # cy add
+ addl $18,32,$18 # update s2_ptr
+ subl $5,$28,$21 # 2nd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+ .align 4
+.Loop: cmpult $5,$21,$25 # compute cy from last add
+ ldl $0,0($18)
+ or $8,$25,$25 # combine cy from the two adds
+ ldl $1,8($18)
+ addl $2,$25,$28 # cy add
+ ldl $4,0($17)
+ subl $6,$28,$22 # 3rd main sub
+ ldl $5,8($17)
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ addl $17,32,$17 # update s1_ptr
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ addl $0,$25,$28 # cy add
+ ldl $2,16($18)
+ subl $4,$28,$20 # 1st main sub
+ ldl $3,24($18)
+ cmpult $28,$25,$8 # compute cy from last add
+ ldl $6,-16($17)
+ cmpult $4,$20,$25 # compute cy from last add
+ ldl $7,-8($17)
+ or $8,$25,$25 # combine cy from the two adds
+ subl $19,4,$19 # decr loop cnt
+ stl $22,-16($16)
+ addl $1,$25,$28 # cy add
+ stl $23,-8($16)
+ subl $5,$28,$21 # 2nd main sub
+ addl $18,32,$18 # update s2_ptr
+ cmpult $28,$25,$8 # compute cy from last add
+ bge $19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $2,$25,$28 # cy add
+ subl $6,$28,$22 # 3rd main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $6,$22,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+ stl $21,8($16)
+ addl $3,$25,$28 # cy add
+ subl $7,$28,$23 # 4th main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $7,$23,$25 # compute cy from last add
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,32,$16 # update res_ptr
+ stl $22,-16($16)
+ stl $23,-8($16)
+.Lend2: addl $19,4,$19 # restore loop cnt
+ beq $19,.Lret
+ # Start software pipeline for 2nd loop
+ ldl $0,0($18)
+ ldl $4,0($17)
+ subl $19,1,$19
+ beq $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+ .align 4
+.Loop0: addl $0,$25,$28 # cy add
+ ldl $0,8($18)
+ subl $4,$28,$20 # main sub
+ ldl $1,8($17)
+ addl $18,8,$18
+ cmpult $28,$25,$8 # compute cy from last add
+ addl $17,8,$17
+ stl $20,0($16)
+ cmpult $4,$20,$25 # compute cy from last add
+ subl $19,1,$19 # decr loop cnt
+ or $8,$25,$25 # combine cy from the two adds
+ addl $16,8,$16
+ or $1,$31,$4
+ bne $19,.Loop0
+.Lend0: addl $0,$25,$28 # cy add
+ subl $4,$28,$20 # main sub
+ cmpult $28,$25,$8 # compute cy from last add
+ cmpult $4,$20,$25 # compute cy from last add
+ stl $20,0($16)
+ or $8,$25,$25 # combine cy from the two adds
+
+.Lret: or $25,$31,$0 # return cy
+ ret $31,($26),1
+ .end __mpn_sub_n
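
The two cmpult results that the routine above ORs together per limb correspond to the two places a borrow can arise: adding the incoming borrow to the s2 limb, and the main subtraction itself. A minimal C sketch of that per-limb step follows; ref_sub_n and mp_limb_t are illustrative names, not part of the patch.

/* Sketch only: rp[] = s1[] - s2[] over n limbs, returning the final borrow.  */
typedef unsigned long mp_limb_t;                /* 64-bit limb assumed */

mp_limb_t
ref_sub_n (mp_limb_t *rp, const mp_limb_t *s1, const mp_limb_t *s2, long n)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      mp_limb_t t = s2[i] + cy;                 /* cy add */
      mp_limb_t cy1 = t < cy;                   /* cmpult: carry from that add */
      mp_limb_t d = s1[i] - t;                  /* main sub */
      mp_limb_t cy2 = d > s1[i];                /* cmpult: borrow from the sub */
      rp[i] = d;
      cy = cy1 | cy2;                           /* combine cy from the two adds */
    }
  return cy;
}
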
diff --git a/sysdeps/sw_64/udiv_qrnnd.S b/sysdeps/sw_64/udiv_qrnnd.S
new file mode 100644
index 00000000..054034cd
--- /dev/null
+++ b/sysdeps/sw_64/udiv_qrnnd.S
@@ -0,0 +1,159 @@
+ # Sw_64 1621 __udiv_qrnnd
+
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+#include <sysdep.h>
+
+ .set noreorder
+ .set noat
+
+ .text
+
+LEAF(__udiv_qrnnd, 0)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ ldi AT, _mcount
+ call AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+#define cnt $2
+#define tmp $3
+#define rem_ptr $16
+#define n1 $17
+#define n0 $18
+#define d $19
+#define qb $20
+
+ ldi cnt,16
+ blt d,$largedivisor
+
+$loop1: cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule d,n1,qb
+ subl n1,d,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ subl cnt,1,cnt
+ bgt cnt,$loop1
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+$largedivisor:
+ and n0,1,$4
+
+ srl n0,1,n0
+ sll n1,63,tmp
+ or tmp,n0,n0
+ srl n1,1,n1
+
+ and d,1,$6
+ srl d,1,$5
+ addl $5,$6,$5
+
+$loop2: cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addl n1,n1,n1
+ bis n1,tmp,n1
+ addl n0,n0,n0
+ cmpule $5,n1,qb
+ subl n1,$5,tmp
+ selne qb,tmp,n1,n1
+ bis n0,qb,n0
+ subl cnt,1,cnt
+ bgt cnt,$loop2
+
+ addl n1,n1,n1
+ addl $4,n1,n1
+ bne $6,$Odd
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+$Odd:
+ /* q' in n0. r' in n1 */
+ addl n1,n0,n1
+
+ cmpult n1,n0,tmp # tmp := carry from addl
+ subl n1,d,AT
+ addl n0,tmp,n0
+ selne tmp,AT,n1,n1
+
+ cmpult n1,d,tmp
+ addl n0,1,AT
+ seleq tmp,AT,n0,n0
+ subl n1,d,AT
+ seleq tmp,AT,n1,n1
+
+ stl n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ .end __udiv_qrnnd
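
The routine above divides the two-limb value n1:n0 by d using restoring shift-and-subtract division: 64 quotient bits are produced, four per pass of $loop1, which runs 16 times. When the divisor has its most significant bit set (blt d,$largedivisor), the operands are halved first and the $Odd path corrects the result for an odd divisor; that path is not reproduced here. A minimal C sketch of the plain loop follows; ref_udiv_qrnnd and mp_limb_t are illustrative names, not part of the patch.

/* Sketch only: the shift-and-subtract loop for divisors with the top bit
   clear.  Assumes n1 < d, as required for the quotient to fit in one limb.  */
typedef unsigned long mp_limb_t;                /* 64-bit limb assumed */

mp_limb_t
ref_udiv_qrnnd (mp_limb_t *rem_ptr, mp_limb_t n1, mp_limb_t n0, mp_limb_t d)
{
  for (int i = 0; i < 64; i++)
    {
      mp_limb_t topbit = n0 >> 63;              /* cmplt n0,0,tmp */
      n1 = (n1 << 1) | topbit;                  /* addl n1,n1,n1; bis n1,tmp,n1 */
      n0 <<= 1;                                 /* addl n0,n0,n0 */
      if (d <= n1)                              /* cmpule d,n1,qb */
        {
          n1 -= d;                              /* subl n1,d,tmp; selne ... */
          n0 |= 1;                              /* bis n0,qb,n0: set quotient bit */
        }
    }
  *rem_ptr = n1;                                /* remainder */
  return n0;                                    /* quotient */
}
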
--
2.25.1