blake3_amd64.s 127 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564
  1. // Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.
  2. #include "textflag.h"
  // iv<>: file-local (the <> suffix), read-only, pointer-free table of four
  // 32-bit words; the GLOBL line below fixes its total size at 16 bytes.
  // compressBlocksAVX512 broadcasts each word into its own vector register
  // (Z16, Z18, Z20, Z22) in its "Initialize state vectors" step.
  3. DATA iv<>+0(SB)/4, $0x6a09e667
  4. DATA iv<>+4(SB)/4, $0xbb67ae85
  5. DATA iv<>+8(SB)/4, $0x3c6ef372
  6. DATA iv<>+12(SB)/4, $0xa54ff53a
  7. GLOBL iv<>(SB), RODATA|NOPTR, $16
  // seq<>: file-local, read-only table holding the sixteen consecutive
  // 32-bit values 0..15 (64 bytes, i.e. one 32-bit value per ZMM lane).
  // compressBlocksAVX512 adds the whole table to the broadcast of
  // counter+24(FP) in Z24, so each lane receives a different offset, and then
  // compares Z24 against the same table to build mask K1. Under K1 the single
  // word at seq<>+4 (value 1) is broadcast-added to Z26, which holds the
  // broadcast of counter+28(FP).
  // NOTE(review): this reads as a per-lane 64-bit counter increment with carry
  // from the low word into the high word -- confirm against gen.go.
  8. DATA seq<>+0(SB)/4, $0x00000000
  9. DATA seq<>+4(SB)/4, $0x00000001
  10. DATA seq<>+8(SB)/4, $0x00000002
  11. DATA seq<>+12(SB)/4, $0x00000003
  12. DATA seq<>+16(SB)/4, $0x00000004
  13. DATA seq<>+20(SB)/4, $0x00000005
  14. DATA seq<>+24(SB)/4, $0x00000006
  15. DATA seq<>+28(SB)/4, $0x00000007
  16. DATA seq<>+32(SB)/4, $0x00000008
  17. DATA seq<>+36(SB)/4, $0x00000009
  18. DATA seq<>+40(SB)/4, $0x0000000a
  19. DATA seq<>+44(SB)/4, $0x0000000b
  20. DATA seq<>+48(SB)/4, $0x0000000c
  21. DATA seq<>+52(SB)/4, $0x0000000d
  22. DATA seq<>+56(SB)/4, $0x0000000e
  23. DATA seq<>+60(SB)/4, $0x0000000f
  24. GLOBL seq<>(SB), RODATA|NOPTR, $64
  // seq64<>: file-local, read-only table holding the eight consecutive
  // 64-bit values 0..7 (64 bytes total) -- the same sequence idea as seq<>,
  // but with 8-byte elements.
  // No reference to this symbol appears in the portion of the file reviewed
  // here; its consumer is presumably a routine further down -- TODO confirm.
  25. DATA seq64<>+0(SB)/8, $0x0000000000000000
  26. DATA seq64<>+8(SB)/8, $0x0000000000000001
  27. DATA seq64<>+16(SB)/8, $0x0000000000000002
  28. DATA seq64<>+24(SB)/8, $0x0000000000000003
  29. DATA seq64<>+32(SB)/8, $0x0000000000000004
  30. DATA seq64<>+40(SB)/8, $0x0000000000000005
  31. DATA seq64<>+48(SB)/8, $0x0000000000000006
  32. DATA seq64<>+56(SB)/8, $0x0000000000000007
  33. GLOBL seq64<>(SB), RODATA|NOPTR, $64
  // shuffle_rot8<>: file-local, read-only 32-byte table of byte indices,
  // written as eight 32-bit words. Within every 4-byte group the stored
  // (little-endian) index order is 1,2,3,0 relative to the group's base.
  // NOTE(review): the name and pattern suggest a byte-shuffle control mask
  // that rotates each 32-bit word right by 8 bits; it is not referenced in
  // the portion of the file reviewed here -- confirm at the use site.
  34. DATA shuffle_rot8<>+0(SB)/4, $0x00030201
  35. DATA shuffle_rot8<>+4(SB)/4, $0x04070605
  36. DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
  37. DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
  38. DATA shuffle_rot8<>+16(SB)/4, $0x10131211
  39. DATA shuffle_rot8<>+20(SB)/4, $0x14171615
  40. DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
  41. DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
  42. GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32
  // shuffle_rot16<>: file-local, read-only 32-byte table of byte indices,
  // written as eight 32-bit words. Within every 4-byte group the stored
  // (little-endian) index order is 2,3,0,1 relative to the group's base,
  // i.e. the two 16-bit halves of each word are swapped.
  // NOTE(review): the name and pattern suggest a byte-shuffle control mask
  // that rotates each 32-bit word by 16 bits; it is not referenced in the
  // portion of the file reviewed here -- confirm at the use site.
  43. DATA shuffle_rot16<>+0(SB)/4, $0x01000302
  44. DATA shuffle_rot16<>+4(SB)/4, $0x05040706
  45. DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
  46. DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
  47. DATA shuffle_rot16<>+16(SB)/4, $0x11101312
  48. DATA shuffle_rot16<>+20(SB)/4, $0x15141716
  49. DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
  50. DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
  51. GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32
  52. // func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
  53. // Requires: AVX512F
  54. TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
  55. MOVQ out+0(FP), AX
  56. MOVQ block+8(FP), CX
  57. MOVQ cv+16(FP), DX
  58. // Initialize block vectors
  59. VPBROADCASTD (CX), Z1
  60. VPBROADCASTD 4(CX), Z3
  61. VPBROADCASTD 8(CX), Z5
  62. VPBROADCASTD 12(CX), Z7
  63. VPBROADCASTD 16(CX), Z9
  64. VPBROADCASTD 20(CX), Z11
  65. VPBROADCASTD 24(CX), Z13
  66. VPBROADCASTD 28(CX), Z15
  67. VPBROADCASTD 32(CX), Z17
  68. VPBROADCASTD 36(CX), Z19
  69. VPBROADCASTD 40(CX), Z21
  70. VPBROADCASTD 44(CX), Z23
  71. VPBROADCASTD 48(CX), Z25
  72. VPBROADCASTD 52(CX), Z27
  73. VPBROADCASTD 56(CX), Z29
  74. VPBROADCASTD 60(CX), Z31
  75. // Initialize state vectors
  76. VPBROADCASTD (DX), Z0
  77. VPBROADCASTD 4(DX), Z2
  78. VPBROADCASTD 8(DX), Z4
  79. VPBROADCASTD 12(DX), Z6
  80. VPBROADCASTD 16(DX), Z8
  81. VPBROADCASTD 20(DX), Z10
  82. VPBROADCASTD 24(DX), Z12
  83. VPBROADCASTD 28(DX), Z14
  84. VPBROADCASTD iv<>+0(SB), Z16
  85. VPBROADCASTD iv<>+4(SB), Z18
  86. VPBROADCASTD iv<>+8(SB), Z20
  87. VPBROADCASTD iv<>+12(SB), Z22
  88. VPBROADCASTD counter+24(FP), Z24
  89. VPADDD seq<>+0(SB), Z24, Z24
  90. VPCMPUD $0x01, seq<>+0(SB), Z24, K1
  91. VPBROADCASTD counter+28(FP), Z26
  92. VPADDD.BCST seq<>+4(SB), Z26, K1, Z26
  93. VPBROADCASTD blockLen+32(FP), Z28
  94. VPBROADCASTD flags+36(FP), Z30
  95. // Round 1
  96. VPADDD Z0, Z8, Z0
  97. VPADDD Z1, Z0, Z0
  98. VPXORD Z24, Z0, Z24
  99. VPRORD $0x10, Z24, Z24
  100. VPADDD Z16, Z24, Z16
  101. VPXORD Z8, Z16, Z8
  102. VPRORD $0x0c, Z8, Z8
  103. VPADDD Z0, Z8, Z0
  104. VPADDD Z3, Z0, Z0
  105. VPXORD Z24, Z0, Z24
  106. VPRORD $0x08, Z24, Z24
  107. VPADDD Z16, Z24, Z16
  108. VPXORD Z8, Z16, Z8
  109. VPRORD $0x07, Z8, Z8
  110. VPADDD Z2, Z10, Z2
  111. VPADDD Z5, Z2, Z2
  112. VPXORD Z26, Z2, Z26
  113. VPRORD $0x10, Z26, Z26
  114. VPADDD Z18, Z26, Z18
  115. VPXORD Z10, Z18, Z10
  116. VPRORD $0x0c, Z10, Z10
  117. VPADDD Z2, Z10, Z2
  118. VPADDD Z7, Z2, Z2
  119. VPXORD Z26, Z2, Z26
  120. VPRORD $0x08, Z26, Z26
  121. VPADDD Z18, Z26, Z18
  122. VPXORD Z10, Z18, Z10
  123. VPRORD $0x07, Z10, Z10
  124. VPADDD Z4, Z12, Z4
  125. VPADDD Z9, Z4, Z4
  126. VPXORD Z28, Z4, Z28
  127. VPRORD $0x10, Z28, Z28
  128. VPADDD Z20, Z28, Z20
  129. VPXORD Z12, Z20, Z12
  130. VPRORD $0x0c, Z12, Z12
  131. VPADDD Z4, Z12, Z4
  132. VPADDD Z11, Z4, Z4
  133. VPXORD Z28, Z4, Z28
  134. VPRORD $0x08, Z28, Z28
  135. VPADDD Z20, Z28, Z20
  136. VPXORD Z12, Z20, Z12
  137. VPRORD $0x07, Z12, Z12
  138. VPADDD Z6, Z14, Z6
  139. VPADDD Z13, Z6, Z6
  140. VPXORD Z30, Z6, Z30
  141. VPRORD $0x10, Z30, Z30
  142. VPADDD Z22, Z30, Z22
  143. VPXORD Z14, Z22, Z14
  144. VPRORD $0x0c, Z14, Z14
  145. VPADDD Z6, Z14, Z6
  146. VPADDD Z15, Z6, Z6
  147. VPXORD Z30, Z6, Z30
  148. VPRORD $0x08, Z30, Z30
  149. VPADDD Z22, Z30, Z22
  150. VPXORD Z14, Z22, Z14
  151. VPRORD $0x07, Z14, Z14
  152. VPADDD Z0, Z10, Z0
  153. VPADDD Z17, Z0, Z0
  154. VPXORD Z30, Z0, Z30
  155. VPRORD $0x10, Z30, Z30
  156. VPADDD Z20, Z30, Z20
  157. VPXORD Z10, Z20, Z10
  158. VPRORD $0x0c, Z10, Z10
  159. VPADDD Z0, Z10, Z0
  160. VPADDD Z19, Z0, Z0
  161. VPXORD Z30, Z0, Z30
  162. VPRORD $0x08, Z30, Z30
  163. VPADDD Z20, Z30, Z20
  164. VPXORD Z10, Z20, Z10
  165. VPRORD $0x07, Z10, Z10
  166. VPADDD Z2, Z12, Z2
  167. VPADDD Z21, Z2, Z2
  168. VPXORD Z24, Z2, Z24
  169. VPRORD $0x10, Z24, Z24
  170. VPADDD Z22, Z24, Z22
  171. VPXORD Z12, Z22, Z12
  172. VPRORD $0x0c, Z12, Z12
  173. VPADDD Z2, Z12, Z2
  174. VPADDD Z23, Z2, Z2
  175. VPXORD Z24, Z2, Z24
  176. VPRORD $0x08, Z24, Z24
  177. VPADDD Z22, Z24, Z22
  178. VPXORD Z12, Z22, Z12
  179. VPRORD $0x07, Z12, Z12
  180. VPADDD Z4, Z14, Z4
  181. VPADDD Z25, Z4, Z4
  182. VPXORD Z26, Z4, Z26
  183. VPRORD $0x10, Z26, Z26
  184. VPADDD Z16, Z26, Z16
  185. VPXORD Z14, Z16, Z14
  186. VPRORD $0x0c, Z14, Z14
  187. VPADDD Z4, Z14, Z4
  188. VPADDD Z27, Z4, Z4
  189. VPXORD Z26, Z4, Z26
  190. VPRORD $0x08, Z26, Z26
  191. VPADDD Z16, Z26, Z16
  192. VPXORD Z14, Z16, Z14
  193. VPRORD $0x07, Z14, Z14
  194. VPADDD Z6, Z8, Z6
  195. VPADDD Z29, Z6, Z6
  196. VPXORD Z28, Z6, Z28
  197. VPRORD $0x10, Z28, Z28
  198. VPADDD Z18, Z28, Z18
  199. VPXORD Z8, Z18, Z8
  200. VPRORD $0x0c, Z8, Z8
  201. VPADDD Z6, Z8, Z6
  202. VPADDD Z31, Z6, Z6
  203. VPXORD Z28, Z6, Z28
  204. VPRORD $0x08, Z28, Z28
  205. VPADDD Z18, Z28, Z18
  206. VPXORD Z8, Z18, Z8
  207. VPRORD $0x07, Z8, Z8
  208. // Round 2
  209. VPADDD Z0, Z8, Z0
  210. VPADDD Z5, Z0, Z0
  211. VPXORD Z24, Z0, Z24
  212. VPRORD $0x10, Z24, Z24
  213. VPADDD Z16, Z24, Z16
  214. VPXORD Z8, Z16, Z8
  215. VPRORD $0x0c, Z8, Z8
  216. VPADDD Z0, Z8, Z0
  217. VPADDD Z13, Z0, Z0
  218. VPXORD Z24, Z0, Z24
  219. VPRORD $0x08, Z24, Z24
  220. VPADDD Z16, Z24, Z16
  221. VPXORD Z8, Z16, Z8
  222. VPRORD $0x07, Z8, Z8
  223. VPADDD Z2, Z10, Z2
  224. VPADDD Z7, Z2, Z2
  225. VPXORD Z26, Z2, Z26
  226. VPRORD $0x10, Z26, Z26
  227. VPADDD Z18, Z26, Z18
  228. VPXORD Z10, Z18, Z10
  229. VPRORD $0x0c, Z10, Z10
  230. VPADDD Z2, Z10, Z2
  231. VPADDD Z21, Z2, Z2
  232. VPXORD Z26, Z2, Z26
  233. VPRORD $0x08, Z26, Z26
  234. VPADDD Z18, Z26, Z18
  235. VPXORD Z10, Z18, Z10
  236. VPRORD $0x07, Z10, Z10
  237. VPADDD Z4, Z12, Z4
  238. VPADDD Z15, Z4, Z4
  239. VPXORD Z28, Z4, Z28
  240. VPRORD $0x10, Z28, Z28
  241. VPADDD Z20, Z28, Z20
  242. VPXORD Z12, Z20, Z12
  243. VPRORD $0x0c, Z12, Z12
  244. VPADDD Z4, Z12, Z4
  245. VPADDD Z1, Z4, Z4
  246. VPXORD Z28, Z4, Z28
  247. VPRORD $0x08, Z28, Z28
  248. VPADDD Z20, Z28, Z20
  249. VPXORD Z12, Z20, Z12
  250. VPRORD $0x07, Z12, Z12
  251. VPADDD Z6, Z14, Z6
  252. VPADDD Z9, Z6, Z6
  253. VPXORD Z30, Z6, Z30
  254. VPRORD $0x10, Z30, Z30
  255. VPADDD Z22, Z30, Z22
  256. VPXORD Z14, Z22, Z14
  257. VPRORD $0x0c, Z14, Z14
  258. VPADDD Z6, Z14, Z6
  259. VPADDD Z27, Z6, Z6
  260. VPXORD Z30, Z6, Z30
  261. VPRORD $0x08, Z30, Z30
  262. VPADDD Z22, Z30, Z22
  263. VPXORD Z14, Z22, Z14
  264. VPRORD $0x07, Z14, Z14
  265. VPADDD Z0, Z10, Z0
  266. VPADDD Z3, Z0, Z0
  267. VPXORD Z30, Z0, Z30
  268. VPRORD $0x10, Z30, Z30
  269. VPADDD Z20, Z30, Z20
  270. VPXORD Z10, Z20, Z10
  271. VPRORD $0x0c, Z10, Z10
  272. VPADDD Z0, Z10, Z0
  273. VPADDD Z23, Z0, Z0
  274. VPXORD Z30, Z0, Z30
  275. VPRORD $0x08, Z30, Z30
  276. VPADDD Z20, Z30, Z20
  277. VPXORD Z10, Z20, Z10
  278. VPRORD $0x07, Z10, Z10
  279. VPADDD Z2, Z12, Z2
  280. VPADDD Z25, Z2, Z2
  281. VPXORD Z24, Z2, Z24
  282. VPRORD $0x10, Z24, Z24
  283. VPADDD Z22, Z24, Z22
  284. VPXORD Z12, Z22, Z12
  285. VPRORD $0x0c, Z12, Z12
  286. VPADDD Z2, Z12, Z2
  287. VPADDD Z11, Z2, Z2
  288. VPXORD Z24, Z2, Z24
  289. VPRORD $0x08, Z24, Z24
  290. VPADDD Z22, Z24, Z22
  291. VPXORD Z12, Z22, Z12
  292. VPRORD $0x07, Z12, Z12
  293. VPADDD Z4, Z14, Z4
  294. VPADDD Z19, Z4, Z4
  295. VPXORD Z26, Z4, Z26
  296. VPRORD $0x10, Z26, Z26
  297. VPADDD Z16, Z26, Z16
  298. VPXORD Z14, Z16, Z14
  299. VPRORD $0x0c, Z14, Z14
  300. VPADDD Z4, Z14, Z4
  301. VPADDD Z29, Z4, Z4
  302. VPXORD Z26, Z4, Z26
  303. VPRORD $0x08, Z26, Z26
  304. VPADDD Z16, Z26, Z16
  305. VPXORD Z14, Z16, Z14
  306. VPRORD $0x07, Z14, Z14
  307. VPADDD Z6, Z8, Z6
  308. VPADDD Z31, Z6, Z6
  309. VPXORD Z28, Z6, Z28
  310. VPRORD $0x10, Z28, Z28
  311. VPADDD Z18, Z28, Z18
  312. VPXORD Z8, Z18, Z8
  313. VPRORD $0x0c, Z8, Z8
  314. VPADDD Z6, Z8, Z6
  315. VPADDD Z17, Z6, Z6
  316. VPXORD Z28, Z6, Z28
  317. VPRORD $0x08, Z28, Z28
  318. VPADDD Z18, Z28, Z18
  319. VPXORD Z8, Z18, Z8
  320. VPRORD $0x07, Z8, Z8
  321. // Round 3
  322. VPADDD Z0, Z8, Z0
  323. VPADDD Z7, Z0, Z0
  324. VPXORD Z24, Z0, Z24
  325. VPRORD $0x10, Z24, Z24
  326. VPADDD Z16, Z24, Z16
  327. VPXORD Z8, Z16, Z8
  328. VPRORD $0x0c, Z8, Z8
  329. VPADDD Z0, Z8, Z0
  330. VPADDD Z9, Z0, Z0
  331. VPXORD Z24, Z0, Z24
  332. VPRORD $0x08, Z24, Z24
  333. VPADDD Z16, Z24, Z16
  334. VPXORD Z8, Z16, Z8
  335. VPRORD $0x07, Z8, Z8
  336. VPADDD Z2, Z10, Z2
  337. VPADDD Z21, Z2, Z2
  338. VPXORD Z26, Z2, Z26
  339. VPRORD $0x10, Z26, Z26
  340. VPADDD Z18, Z26, Z18
  341. VPXORD Z10, Z18, Z10
  342. VPRORD $0x0c, Z10, Z10
  343. VPADDD Z2, Z10, Z2
  344. VPADDD Z25, Z2, Z2
  345. VPXORD Z26, Z2, Z26
  346. VPRORD $0x08, Z26, Z26
  347. VPADDD Z18, Z26, Z18
  348. VPXORD Z10, Z18, Z10
  349. VPRORD $0x07, Z10, Z10
  350. VPADDD Z4, Z12, Z4
  351. VPADDD Z27, Z4, Z4
  352. VPXORD Z28, Z4, Z28
  353. VPRORD $0x10, Z28, Z28
  354. VPADDD Z20, Z28, Z20
  355. VPXORD Z12, Z20, Z12
  356. VPRORD $0x0c, Z12, Z12
  357. VPADDD Z4, Z12, Z4
  358. VPADDD Z5, Z4, Z4
  359. VPXORD Z28, Z4, Z28
  360. VPRORD $0x08, Z28, Z28
  361. VPADDD Z20, Z28, Z20
  362. VPXORD Z12, Z20, Z12
  363. VPRORD $0x07, Z12, Z12
  364. VPADDD Z6, Z14, Z6
  365. VPADDD Z15, Z6, Z6
  366. VPXORD Z30, Z6, Z30
  367. VPRORD $0x10, Z30, Z30
  368. VPADDD Z22, Z30, Z22
  369. VPXORD Z14, Z22, Z14
  370. VPRORD $0x0c, Z14, Z14
  371. VPADDD Z6, Z14, Z6
  372. VPADDD Z29, Z6, Z6
  373. VPXORD Z30, Z6, Z30
  374. VPRORD $0x08, Z30, Z30
  375. VPADDD Z22, Z30, Z22
  376. VPXORD Z14, Z22, Z14
  377. VPRORD $0x07, Z14, Z14
  378. VPADDD Z0, Z10, Z0
  379. VPADDD Z13, Z0, Z0
  380. VPXORD Z30, Z0, Z30
  381. VPRORD $0x10, Z30, Z30
  382. VPADDD Z20, Z30, Z20
  383. VPXORD Z10, Z20, Z10
  384. VPRORD $0x0c, Z10, Z10
  385. VPADDD Z0, Z10, Z0
  386. VPADDD Z11, Z0, Z0
  387. VPXORD Z30, Z0, Z30
  388. VPRORD $0x08, Z30, Z30
  389. VPADDD Z20, Z30, Z20
  390. VPXORD Z10, Z20, Z10
  391. VPRORD $0x07, Z10, Z10
  392. VPADDD Z2, Z12, Z2
  393. VPADDD Z19, Z2, Z2
  394. VPXORD Z24, Z2, Z24
  395. VPRORD $0x10, Z24, Z24
  396. VPADDD Z22, Z24, Z22
  397. VPXORD Z12, Z22, Z12
  398. VPRORD $0x0c, Z12, Z12
  399. VPADDD Z2, Z12, Z2
  400. VPADDD Z1, Z2, Z2
  401. VPXORD Z24, Z2, Z24
  402. VPRORD $0x08, Z24, Z24
  403. VPADDD Z22, Z24, Z22
  404. VPXORD Z12, Z22, Z12
  405. VPRORD $0x07, Z12, Z12
  406. VPADDD Z4, Z14, Z4
  407. VPADDD Z23, Z4, Z4
  408. VPXORD Z26, Z4, Z26
  409. VPRORD $0x10, Z26, Z26
  410. VPADDD Z16, Z26, Z16
  411. VPXORD Z14, Z16, Z14
  412. VPRORD $0x0c, Z14, Z14
  413. VPADDD Z4, Z14, Z4
  414. VPADDD Z31, Z4, Z4
  415. VPXORD Z26, Z4, Z26
  416. VPRORD $0x08, Z26, Z26
  417. VPADDD Z16, Z26, Z16
  418. VPXORD Z14, Z16, Z14
  419. VPRORD $0x07, Z14, Z14
  420. VPADDD Z6, Z8, Z6
  421. VPADDD Z17, Z6, Z6
  422. VPXORD Z28, Z6, Z28
  423. VPRORD $0x10, Z28, Z28
  424. VPADDD Z18, Z28, Z18
  425. VPXORD Z8, Z18, Z8
  426. VPRORD $0x0c, Z8, Z8
  427. VPADDD Z6, Z8, Z6
  428. VPADDD Z3, Z6, Z6
  429. VPXORD Z28, Z6, Z28
  430. VPRORD $0x08, Z28, Z28
  431. VPADDD Z18, Z28, Z18
  432. VPXORD Z8, Z18, Z8
  433. VPRORD $0x07, Z8, Z8
  434. // Round 4
  435. VPADDD Z0, Z8, Z0
  436. VPADDD Z21, Z0, Z0
  437. VPXORD Z24, Z0, Z24
  438. VPRORD $0x10, Z24, Z24
  439. VPADDD Z16, Z24, Z16
  440. VPXORD Z8, Z16, Z8
  441. VPRORD $0x0c, Z8, Z8
  442. VPADDD Z0, Z8, Z0
  443. VPADDD Z15, Z0, Z0
  444. VPXORD Z24, Z0, Z24
  445. VPRORD $0x08, Z24, Z24
  446. VPADDD Z16, Z24, Z16
  447. VPXORD Z8, Z16, Z8
  448. VPRORD $0x07, Z8, Z8
  449. VPADDD Z2, Z10, Z2
  450. VPADDD Z25, Z2, Z2
  451. VPXORD Z26, Z2, Z26
  452. VPRORD $0x10, Z26, Z26
  453. VPADDD Z18, Z26, Z18
  454. VPXORD Z10, Z18, Z10
  455. VPRORD $0x0c, Z10, Z10
  456. VPADDD Z2, Z10, Z2
  457. VPADDD Z19, Z2, Z2
  458. VPXORD Z26, Z2, Z26
  459. VPRORD $0x08, Z26, Z26
  460. VPADDD Z18, Z26, Z18
  461. VPXORD Z10, Z18, Z10
  462. VPRORD $0x07, Z10, Z10
  463. VPADDD Z4, Z12, Z4
  464. VPADDD Z29, Z4, Z4
  465. VPXORD Z28, Z4, Z28
  466. VPRORD $0x10, Z28, Z28
  467. VPADDD Z20, Z28, Z20
  468. VPXORD Z12, Z20, Z12
  469. VPRORD $0x0c, Z12, Z12
  470. VPADDD Z4, Z12, Z4
  471. VPADDD Z7, Z4, Z4
  472. VPXORD Z28, Z4, Z28
  473. VPRORD $0x08, Z28, Z28
  474. VPADDD Z20, Z28, Z20
  475. VPXORD Z12, Z20, Z12
  476. VPRORD $0x07, Z12, Z12
  477. VPADDD Z6, Z14, Z6
  478. VPADDD Z27, Z6, Z6
  479. VPXORD Z30, Z6, Z30
  480. VPRORD $0x10, Z30, Z30
  481. VPADDD Z22, Z30, Z22
  482. VPXORD Z14, Z22, Z14
  483. VPRORD $0x0c, Z14, Z14
  484. VPADDD Z6, Z14, Z6
  485. VPADDD Z31, Z6, Z6
  486. VPXORD Z30, Z6, Z30
  487. VPRORD $0x08, Z30, Z30
  488. VPADDD Z22, Z30, Z22
  489. VPXORD Z14, Z22, Z14
  490. VPRORD $0x07, Z14, Z14
  491. VPADDD Z0, Z10, Z0
  492. VPADDD Z9, Z0, Z0
  493. VPXORD Z30, Z0, Z30
  494. VPRORD $0x10, Z30, Z30
  495. VPADDD Z20, Z30, Z20
  496. VPXORD Z10, Z20, Z10
  497. VPRORD $0x0c, Z10, Z10
  498. VPADDD Z0, Z10, Z0
  499. VPADDD Z1, Z0, Z0
  500. VPXORD Z30, Z0, Z30
  501. VPRORD $0x08, Z30, Z30
  502. VPADDD Z20, Z30, Z20
  503. VPXORD Z10, Z20, Z10
  504. VPRORD $0x07, Z10, Z10
  505. VPADDD Z2, Z12, Z2
  506. VPADDD Z23, Z2, Z2
  507. VPXORD Z24, Z2, Z24
  508. VPRORD $0x10, Z24, Z24
  509. VPADDD Z22, Z24, Z22
  510. VPXORD Z12, Z22, Z12
  511. VPRORD $0x0c, Z12, Z12
  512. VPADDD Z2, Z12, Z2
  513. VPADDD Z5, Z2, Z2
  514. VPXORD Z24, Z2, Z24
  515. VPRORD $0x08, Z24, Z24
  516. VPADDD Z22, Z24, Z22
  517. VPXORD Z12, Z22, Z12
  518. VPRORD $0x07, Z12, Z12
  519. VPADDD Z4, Z14, Z4
  520. VPADDD Z11, Z4, Z4
  521. VPXORD Z26, Z4, Z26
  522. VPRORD $0x10, Z26, Z26
  523. VPADDD Z16, Z26, Z16
  524. VPXORD Z14, Z16, Z14
  525. VPRORD $0x0c, Z14, Z14
  526. VPADDD Z4, Z14, Z4
  527. VPADDD Z17, Z4, Z4
  528. VPXORD Z26, Z4, Z26
  529. VPRORD $0x08, Z26, Z26
  530. VPADDD Z16, Z26, Z16
  531. VPXORD Z14, Z16, Z14
  532. VPRORD $0x07, Z14, Z14
  533. VPADDD Z6, Z8, Z6
  534. VPADDD Z3, Z6, Z6
  535. VPXORD Z28, Z6, Z28
  536. VPRORD $0x10, Z28, Z28
  537. VPADDD Z18, Z28, Z18
  538. VPXORD Z8, Z18, Z8
  539. VPRORD $0x0c, Z8, Z8
  540. VPADDD Z6, Z8, Z6
  541. VPADDD Z13, Z6, Z6
  542. VPXORD Z28, Z6, Z28
  543. VPRORD $0x08, Z28, Z28
  544. VPADDD Z18, Z28, Z18
  545. VPXORD Z8, Z18, Z8
  546. VPRORD $0x07, Z8, Z8
  547. // Round 5
  548. VPADDD Z0, Z8, Z0
  549. VPADDD Z25, Z0, Z0
  550. VPXORD Z24, Z0, Z24
  551. VPRORD $0x10, Z24, Z24
  552. VPADDD Z16, Z24, Z16
  553. VPXORD Z8, Z16, Z8
  554. VPRORD $0x0c, Z8, Z8
  555. VPADDD Z0, Z8, Z0
  556. VPADDD Z27, Z0, Z0
  557. VPXORD Z24, Z0, Z24
  558. VPRORD $0x08, Z24, Z24
  559. VPADDD Z16, Z24, Z16
  560. VPXORD Z8, Z16, Z8
  561. VPRORD $0x07, Z8, Z8
  562. VPADDD Z2, Z10, Z2
  563. VPADDD Z19, Z2, Z2
  564. VPXORD Z26, Z2, Z26
  565. VPRORD $0x10, Z26, Z26
  566. VPADDD Z18, Z26, Z18
  567. VPXORD Z10, Z18, Z10
  568. VPRORD $0x0c, Z10, Z10
  569. VPADDD Z2, Z10, Z2
  570. VPADDD Z23, Z2, Z2
  571. VPXORD Z26, Z2, Z26
  572. VPRORD $0x08, Z26, Z26
  573. VPADDD Z18, Z26, Z18
  574. VPXORD Z10, Z18, Z10
  575. VPRORD $0x07, Z10, Z10
  576. VPADDD Z4, Z12, Z4
  577. VPADDD Z31, Z4, Z4
  578. VPXORD Z28, Z4, Z28
  579. VPRORD $0x10, Z28, Z28
  580. VPADDD Z20, Z28, Z20
  581. VPXORD Z12, Z20, Z12
  582. VPRORD $0x0c, Z12, Z12
  583. VPADDD Z4, Z12, Z4
  584. VPADDD Z21, Z4, Z4
  585. VPXORD Z28, Z4, Z28
  586. VPRORD $0x08, Z28, Z28
  587. VPADDD Z20, Z28, Z20
  588. VPXORD Z12, Z20, Z12
  589. VPRORD $0x07, Z12, Z12
  590. VPADDD Z6, Z14, Z6
  591. VPADDD Z29, Z6, Z6
  592. VPXORD Z30, Z6, Z30
  593. VPRORD $0x10, Z30, Z30
  594. VPADDD Z22, Z30, Z22
  595. VPXORD Z14, Z22, Z14
  596. VPRORD $0x0c, Z14, Z14
  597. VPADDD Z6, Z14, Z6
  598. VPADDD Z17, Z6, Z6
  599. VPXORD Z30, Z6, Z30
  600. VPRORD $0x08, Z30, Z30
  601. VPADDD Z22, Z30, Z22
  602. VPXORD Z14, Z22, Z14
  603. VPRORD $0x07, Z14, Z14
  604. VPADDD Z0, Z10, Z0
  605. VPADDD Z15, Z0, Z0
  606. VPXORD Z30, Z0, Z30
  607. VPRORD $0x10, Z30, Z30
  608. VPADDD Z20, Z30, Z20
  609. VPXORD Z10, Z20, Z10
  610. VPRORD $0x0c, Z10, Z10
  611. VPADDD Z0, Z10, Z0
  612. VPADDD Z5, Z0, Z0
  613. VPXORD Z30, Z0, Z30
  614. VPRORD $0x08, Z30, Z30
  615. VPADDD Z20, Z30, Z20
  616. VPXORD Z10, Z20, Z10
  617. VPRORD $0x07, Z10, Z10
  618. VPADDD Z2, Z12, Z2
  619. VPADDD Z11, Z2, Z2
  620. VPXORD Z24, Z2, Z24
  621. VPRORD $0x10, Z24, Z24
  622. VPADDD Z22, Z24, Z22
  623. VPXORD Z12, Z22, Z12
  624. VPRORD $0x0c, Z12, Z12
  625. VPADDD Z2, Z12, Z2
  626. VPADDD Z7, Z2, Z2
  627. VPXORD Z24, Z2, Z24
  628. VPRORD $0x08, Z24, Z24
  629. VPADDD Z22, Z24, Z22
  630. VPXORD Z12, Z22, Z12
  631. VPRORD $0x07, Z12, Z12
  632. VPADDD Z4, Z14, Z4
  633. VPADDD Z1, Z4, Z4
  634. VPXORD Z26, Z4, Z26
  635. VPRORD $0x10, Z26, Z26
  636. VPADDD Z16, Z26, Z16
  637. VPXORD Z14, Z16, Z14
  638. VPRORD $0x0c, Z14, Z14
  639. VPADDD Z4, Z14, Z4
  640. VPADDD Z3, Z4, Z4
  641. VPXORD Z26, Z4, Z26
  642. VPRORD $0x08, Z26, Z26
  643. VPADDD Z16, Z26, Z16
  644. VPXORD Z14, Z16, Z14
  645. VPRORD $0x07, Z14, Z14
  646. VPADDD Z6, Z8, Z6
  647. VPADDD Z13, Z6, Z6
  648. VPXORD Z28, Z6, Z28
  649. VPRORD $0x10, Z28, Z28
  650. VPADDD Z18, Z28, Z18
  651. VPXORD Z8, Z18, Z8
  652. VPRORD $0x0c, Z8, Z8
  653. VPADDD Z6, Z8, Z6
  654. VPADDD Z9, Z6, Z6
  655. VPXORD Z28, Z6, Z28
  656. VPRORD $0x08, Z28, Z28
  657. VPADDD Z18, Z28, Z18
  658. VPXORD Z8, Z18, Z8
  659. VPRORD $0x07, Z8, Z8
  660. // Round 6
  661. VPADDD Z0, Z8, Z0
  662. VPADDD Z19, Z0, Z0
  663. VPXORD Z24, Z0, Z24
  664. VPRORD $0x10, Z24, Z24
  665. VPADDD Z16, Z24, Z16
  666. VPXORD Z8, Z16, Z8
  667. VPRORD $0x0c, Z8, Z8
  668. VPADDD Z0, Z8, Z0
  669. VPADDD Z29, Z0, Z0
  670. VPXORD Z24, Z0, Z24
  671. VPRORD $0x08, Z24, Z24
  672. VPADDD Z16, Z24, Z16
  673. VPXORD Z8, Z16, Z8
  674. VPRORD $0x07, Z8, Z8
  675. VPADDD Z2, Z10, Z2
  676. VPADDD Z23, Z2, Z2
  677. VPXORD Z26, Z2, Z26
  678. VPRORD $0x10, Z26, Z26
  679. VPADDD Z18, Z26, Z18
  680. VPXORD Z10, Z18, Z10
  681. VPRORD $0x0c, Z10, Z10
  682. VPADDD Z2, Z10, Z2
  683. VPADDD Z11, Z2, Z2
  684. VPXORD Z26, Z2, Z26
  685. VPRORD $0x08, Z26, Z26
  686. VPADDD Z18, Z26, Z18
  687. VPXORD Z10, Z18, Z10
  688. VPRORD $0x07, Z10, Z10
  689. VPADDD Z4, Z12, Z4
  690. VPADDD Z17, Z4, Z4
  691. VPXORD Z28, Z4, Z28
  692. VPRORD $0x10, Z28, Z28
  693. VPADDD Z20, Z28, Z20
  694. VPXORD Z12, Z20, Z12
  695. VPRORD $0x0c, Z12, Z12
  696. VPADDD Z4, Z12, Z4
  697. VPADDD Z25, Z4, Z4
  698. VPXORD Z28, Z4, Z28
  699. VPRORD $0x08, Z28, Z28
  700. VPADDD Z20, Z28, Z20
  701. VPXORD Z12, Z20, Z12
  702. VPRORD $0x07, Z12, Z12
  703. VPADDD Z6, Z14, Z6
  704. VPADDD Z31, Z6, Z6
  705. VPXORD Z30, Z6, Z30
  706. VPRORD $0x10, Z30, Z30
  707. VPADDD Z22, Z30, Z22
  708. VPXORD Z14, Z22, Z14
  709. VPRORD $0x0c, Z14, Z14
  710. VPADDD Z6, Z14, Z6
  711. VPADDD Z3, Z6, Z6
  712. VPXORD Z30, Z6, Z30
  713. VPRORD $0x08, Z30, Z30
  714. VPADDD Z22, Z30, Z22
  715. VPXORD Z14, Z22, Z14
  716. VPRORD $0x07, Z14, Z14
  717. VPADDD Z0, Z10, Z0
  718. VPADDD Z27, Z0, Z0
  719. VPXORD Z30, Z0, Z30
  720. VPRORD $0x10, Z30, Z30
  721. VPADDD Z20, Z30, Z20
  722. VPXORD Z10, Z20, Z10
  723. VPRORD $0x0c, Z10, Z10
  724. VPADDD Z0, Z10, Z0
  725. VPADDD Z7, Z0, Z0
  726. VPXORD Z30, Z0, Z30
  727. VPRORD $0x08, Z30, Z30
  728. VPADDD Z20, Z30, Z20
  729. VPXORD Z10, Z20, Z10
  730. VPRORD $0x07, Z10, Z10
  731. VPADDD Z2, Z12, Z2
  732. VPADDD Z1, Z2, Z2
  733. VPXORD Z24, Z2, Z24
  734. VPRORD $0x10, Z24, Z24
  735. VPADDD Z22, Z24, Z22
  736. VPXORD Z12, Z22, Z12
  737. VPRORD $0x0c, Z12, Z12
  738. VPADDD Z2, Z12, Z2
  739. VPADDD Z21, Z2, Z2
  740. VPXORD Z24, Z2, Z24
  741. VPRORD $0x08, Z24, Z24
  742. VPADDD Z22, Z24, Z22
  743. VPXORD Z12, Z22, Z12
  744. VPRORD $0x07, Z12, Z12
  745. VPADDD Z4, Z14, Z4
  746. VPADDD Z5, Z4, Z4
  747. VPXORD Z26, Z4, Z26
  748. VPRORD $0x10, Z26, Z26
  749. VPADDD Z16, Z26, Z16
  750. VPXORD Z14, Z16, Z14
  751. VPRORD $0x0c, Z14, Z14
  752. VPADDD Z4, Z14, Z4
  753. VPADDD Z13, Z4, Z4
  754. VPXORD Z26, Z4, Z26
  755. VPRORD $0x08, Z26, Z26
  756. VPADDD Z16, Z26, Z16
  757. VPXORD Z14, Z16, Z14
  758. VPRORD $0x07, Z14, Z14
  759. VPADDD Z6, Z8, Z6
  760. VPADDD Z9, Z6, Z6
  761. VPXORD Z28, Z6, Z28
  762. VPRORD $0x10, Z28, Z28
  763. VPADDD Z18, Z28, Z18
  764. VPXORD Z8, Z18, Z8
  765. VPRORD $0x0c, Z8, Z8
  766. VPADDD Z6, Z8, Z6
  767. VPADDD Z15, Z6, Z6
  768. VPXORD Z28, Z6, Z28
  769. VPRORD $0x08, Z28, Z28
  770. VPADDD Z18, Z28, Z18
  771. VPXORD Z8, Z18, Z8
  772. VPRORD $0x07, Z8, Z8
  773. // Round 7
  774. VPADDD Z0, Z8, Z0
  775. VPADDD Z23, Z0, Z0
  776. VPXORD Z24, Z0, Z24
  777. VPRORD $0x10, Z24, Z24
  778. VPADDD Z16, Z24, Z16
  779. VPXORD Z8, Z16, Z8
  780. VPRORD $0x0c, Z8, Z8
  781. VPADDD Z0, Z8, Z0
  782. VPADDD Z31, Z0, Z0
  783. VPXORD Z24, Z0, Z24
  784. VPRORD $0x08, Z24, Z24
  785. VPADDD Z16, Z24, Z16
  786. VPXORD Z8, Z16, Z8
  787. VPRORD $0x07, Z8, Z8
  788. VPADDD Z2, Z10, Z2
  789. VPADDD Z11, Z2, Z2
  790. VPXORD Z26, Z2, Z26
  791. VPRORD $0x10, Z26, Z26
  792. VPADDD Z18, Z26, Z18
  793. VPXORD Z10, Z18, Z10
  794. VPRORD $0x0c, Z10, Z10
  795. VPADDD Z2, Z10, Z2
  796. VPADDD Z1, Z2, Z2
  797. VPXORD Z26, Z2, Z26
  798. VPRORD $0x08, Z26, Z26
  799. VPADDD Z18, Z26, Z18
  800. VPXORD Z10, Z18, Z10
  801. VPRORD $0x07, Z10, Z10
  802. VPADDD Z4, Z12, Z4
  803. VPADDD Z3, Z4, Z4
  804. VPXORD Z28, Z4, Z28
  805. VPRORD $0x10, Z28, Z28
  806. VPADDD Z20, Z28, Z20
  807. VPXORD Z12, Z20, Z12
  808. VPRORD $0x0c, Z12, Z12
  809. VPADDD Z4, Z12, Z4
  810. VPADDD Z19, Z4, Z4
  811. VPXORD Z28, Z4, Z28
  812. VPRORD $0x08, Z28, Z28
  813. VPADDD Z20, Z28, Z20
  814. VPXORD Z12, Z20, Z12
  815. VPRORD $0x07, Z12, Z12
  816. VPADDD Z6, Z14, Z6
  817. VPADDD Z17, Z6, Z6
  818. VPXORD Z30, Z6, Z30
  819. VPRORD $0x10, Z30, Z30
  820. VPADDD Z22, Z30, Z22
  821. VPXORD Z14, Z22, Z14
  822. VPRORD $0x0c, Z14, Z14
  823. VPADDD Z6, Z14, Z6
  824. VPADDD Z13, Z6, Z6
  825. VPXORD Z30, Z6, Z30
  826. VPRORD $0x08, Z30, Z30
  827. VPADDD Z22, Z30, Z22
  828. VPXORD Z14, Z22, Z14
  829. VPRORD $0x07, Z14, Z14
  830. VPADDD Z0, Z10, Z0
  831. VPADDD Z29, Z0, Z0
  832. VPXORD Z30, Z0, Z30
  833. VPRORD $0x10, Z30, Z30
  834. VPADDD Z20, Z30, Z20
  835. VPXORD Z10, Z20, Z10
  836. VPRORD $0x0c, Z10, Z10
  837. VPADDD Z0, Z10, Z0
  838. VPADDD Z21, Z0, Z0
  839. VPXORD Z30, Z0, Z30
  840. VPRORD $0x08, Z30, Z30
  841. VPADDD Z20, Z30, Z20
  842. VPXORD Z10, Z20, Z10
  843. VPRORD $0x07, Z10, Z10
  844. VPADDD Z2, Z12, Z2
  845. VPADDD Z5, Z2, Z2
  846. VPXORD Z24, Z2, Z24
  847. VPRORD $0x10, Z24, Z24
  848. VPADDD Z22, Z24, Z22
  849. VPXORD Z12, Z22, Z12
  850. VPRORD $0x0c, Z12, Z12
  851. VPADDD Z2, Z12, Z2
  852. VPADDD Z25, Z2, Z2
  853. VPXORD Z24, Z2, Z24
  854. VPRORD $0x08, Z24, Z24
  855. VPADDD Z22, Z24, Z22
  856. VPXORD Z12, Z22, Z12
  857. VPRORD $0x07, Z12, Z12
  858. VPADDD Z4, Z14, Z4
  859. VPADDD Z7, Z4, Z4
  860. VPXORD Z26, Z4, Z26
  861. VPRORD $0x10, Z26, Z26
  862. VPADDD Z16, Z26, Z16
  863. VPXORD Z14, Z16, Z14
  864. VPRORD $0x0c, Z14, Z14
  865. VPADDD Z4, Z14, Z4
  866. VPADDD Z9, Z4, Z4
  867. VPXORD Z26, Z4, Z26
  868. VPRORD $0x08, Z26, Z26
  869. VPADDD Z16, Z26, Z16
  870. VPXORD Z14, Z16, Z14
  871. VPRORD $0x07, Z14, Z14
  872. VPADDD Z6, Z8, Z6
  873. VPADDD Z15, Z6, Z6
  874. VPXORD Z28, Z6, Z28
  875. VPRORD $0x10, Z28, Z28
  876. VPADDD Z18, Z28, Z18
  877. VPXORD Z8, Z18, Z8
  878. VPRORD $0x0c, Z8, Z8
  879. VPADDD Z6, Z8, Z6
  880. VPADDD Z27, Z6, Z6
  881. VPXORD Z28, Z6, Z28
  882. VPRORD $0x08, Z28, Z28
  883. VPADDD Z18, Z28, Z18
  884. VPXORD Z8, Z18, Z8
  885. VPRORD $0x07, Z8, Z8
  886. // Finalize CVs
  887. VPXORD Z0, Z16, Z0
  888. VPXORD Z2, Z18, Z2
  889. VPXORD Z4, Z20, Z4
  890. VPXORD Z6, Z22, Z6
  891. VPXORD Z8, Z24, Z8
  892. VPXORD Z10, Z26, Z10
  893. VPXORD Z12, Z28, Z12
  894. VPXORD Z14, Z30, Z14
  895. VPXORD.BCST (DX), Z16, Z16
  896. VPXORD.BCST 4(DX), Z18, Z18
  897. VPXORD.BCST 8(DX), Z20, Z20
  898. VPXORD.BCST 12(DX), Z22, Z22
  899. VPXORD.BCST 16(DX), Z24, Z24
  900. VPXORD.BCST 20(DX), Z26, Z26
  901. VPXORD.BCST 24(DX), Z28, Z28
  902. VPXORD.BCST 28(DX), Z30, Z30
  903. VMOVDQU32 seq<>+0(SB), Z1
  904. VPSLLD $0x06, Z1, Z1
  905. KXNORD K1, K1, K1
  906. VPSCATTERDD Z0, K1, (AX)(Z1*1)
  907. KXNORD K1, K1, K1
  908. VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
  909. KXNORD K1, K1, K1
  910. VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
  911. KXNORD K1, K1, K1
  912. VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
  913. KXNORD K1, K1, K1
  914. VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
  915. KXNORD K1, K1, K1
  916. VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
  917. KXNORD K1, K1, K1
  918. VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
  919. KXNORD K1, K1, K1
  920. VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
  921. KXNORD K1, K1, K1
  922. VPSCATTERDD Z16, K1, 32(AX)(Z1*1)
  923. KXNORD K1, K1, K1
  924. VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
  925. KXNORD K1, K1, K1
  926. VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
  927. KXNORD K1, K1, K1
  928. VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
  929. KXNORD K1, K1, K1
  930. VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
  931. KXNORD K1, K1, K1
  932. VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
  933. KXNORD K1, K1, K1
  934. VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
  935. KXNORD K1, K1, K1
  936. VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
  937. RET
  938. // func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
  939. // Requires: AVX512F
  940. TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-40
  941. MOVQ cvs+0(FP), AX
  942. MOVQ buf+8(FP), CX
  943. MOVQ key+16(FP), DX
  944. // Initialize counter
  945. VPBROADCASTD counter+24(FP), Z0
  946. VPADDD seq<>+0(SB), Z0, Z0
  947. VPCMPUD $0x01, seq<>+0(SB), Z0, K1
  948. VPBROADCASTD counter+28(FP), Z2
  949. VPADDD.BCST seq<>+4(SB), Z2, K1, Z2
  950. VMOVDQU32 Z0, (SP)
  951. VMOVDQU32 Z2, 64(SP)
  952. // Initialize flags
  953. VPBROADCASTD flags+32(FP), Z0
  954. VMOVDQU32 Z0, 128(SP)
  955. ORL $0x01, 128(SP)
  956. ORL $0x02, 188(SP)
  957. // Load key
  958. VPBROADCASTD (DX), Z0
  959. VPBROADCASTD 4(DX), Z2
  960. VPBROADCASTD 8(DX), Z4
  961. VPBROADCASTD 12(DX), Z6
  962. VPBROADCASTD 16(DX), Z8
  963. VPBROADCASTD 20(DX), Z10
  964. VPBROADCASTD 24(DX), Z12
  965. VPBROADCASTD 28(DX), Z14
  966. // Loop index
  967. XORQ DX, DX
  968. loop:
  969. // Load transposed block
  970. VMOVDQU32 seq<>+0(SB), Z16
  971. VPSLLD $0x0a, Z16, Z16
  972. KXNORD K1, K1, K1
  973. VPGATHERDD (CX)(Z16*1), K1, Z1
  974. KXNORD K1, K1, K1
  975. VPGATHERDD 4(CX)(Z16*1), K1, Z3
  976. KXNORD K1, K1, K1
  977. VPGATHERDD 8(CX)(Z16*1), K1, Z5
  978. KXNORD K1, K1, K1
  979. VPGATHERDD 12(CX)(Z16*1), K1, Z7
  980. KXNORD K1, K1, K1
  981. VPGATHERDD 16(CX)(Z16*1), K1, Z9
  982. KXNORD K1, K1, K1
  983. VPGATHERDD 20(CX)(Z16*1), K1, Z11
  984. KXNORD K1, K1, K1
  985. VPGATHERDD 24(CX)(Z16*1), K1, Z13
  986. KXNORD K1, K1, K1
  987. VPGATHERDD 28(CX)(Z16*1), K1, Z15
  988. KXNORD K1, K1, K1
  989. VPGATHERDD 32(CX)(Z16*1), K1, Z17
  990. KXNORD K1, K1, K1
  991. VPGATHERDD 36(CX)(Z16*1), K1, Z19
  992. KXNORD K1, K1, K1
  993. VPGATHERDD 40(CX)(Z16*1), K1, Z21
  994. KXNORD K1, K1, K1
  995. VPGATHERDD 44(CX)(Z16*1), K1, Z23
  996. KXNORD K1, K1, K1
  997. VPGATHERDD 48(CX)(Z16*1), K1, Z25
  998. KXNORD K1, K1, K1
  999. VPGATHERDD 52(CX)(Z16*1), K1, Z27
  1000. KXNORD K1, K1, K1
  1001. VPGATHERDD 56(CX)(Z16*1), K1, Z29
  1002. KXNORD K1, K1, K1
  1003. VPGATHERDD 60(CX)(Z16*1), K1, Z31
  1004. ADDQ $0x40, CX
  1005. // Reload state vectors (other than CVs)
  1006. VPBROADCASTD iv<>+0(SB), Z16
  1007. VPBROADCASTD iv<>+4(SB), Z18
  1008. VPBROADCASTD iv<>+8(SB), Z20
  1009. VPBROADCASTD iv<>+12(SB), Z22
  1010. VMOVDQU32 (SP), Z24
  1011. VMOVDQU32 64(SP), Z26
  1012. VPBROADCASTD seq<>+4(SB), Z28
  1013. VPSLLD $0x06, Z28, Z28
  1014. VPBROADCASTD 128(SP)(DX*4), Z30
  1015. // Round 1
  1016. VPADDD Z0, Z8, Z0
  1017. VPADDD Z1, Z0, Z0
  1018. VPXORD Z24, Z0, Z24
  1019. VPRORD $0x10, Z24, Z24
  1020. VPADDD Z16, Z24, Z16
  1021. VPXORD Z8, Z16, Z8
  1022. VPRORD $0x0c, Z8, Z8
  1023. VPADDD Z0, Z8, Z0
  1024. VPADDD Z3, Z0, Z0
  1025. VPXORD Z24, Z0, Z24
  1026. VPRORD $0x08, Z24, Z24
  1027. VPADDD Z16, Z24, Z16
  1028. VPXORD Z8, Z16, Z8
  1029. VPRORD $0x07, Z8, Z8
  1030. VPADDD Z2, Z10, Z2
  1031. VPADDD Z5, Z2, Z2
  1032. VPXORD Z26, Z2, Z26
  1033. VPRORD $0x10, Z26, Z26
  1034. VPADDD Z18, Z26, Z18
  1035. VPXORD Z10, Z18, Z10
  1036. VPRORD $0x0c, Z10, Z10
  1037. VPADDD Z2, Z10, Z2
  1038. VPADDD Z7, Z2, Z2
  1039. VPXORD Z26, Z2, Z26
  1040. VPRORD $0x08, Z26, Z26
  1041. VPADDD Z18, Z26, Z18
  1042. VPXORD Z10, Z18, Z10
  1043. VPRORD $0x07, Z10, Z10
  1044. VPADDD Z4, Z12, Z4
  1045. VPADDD Z9, Z4, Z4
  1046. VPXORD Z28, Z4, Z28
  1047. VPRORD $0x10, Z28, Z28
  1048. VPADDD Z20, Z28, Z20
  1049. VPXORD Z12, Z20, Z12
  1050. VPRORD $0x0c, Z12, Z12
  1051. VPADDD Z4, Z12, Z4
  1052. VPADDD Z11, Z4, Z4
  1053. VPXORD Z28, Z4, Z28
  1054. VPRORD $0x08, Z28, Z28
  1055. VPADDD Z20, Z28, Z20
  1056. VPXORD Z12, Z20, Z12
  1057. VPRORD $0x07, Z12, Z12
  1058. VPADDD Z6, Z14, Z6
  1059. VPADDD Z13, Z6, Z6
  1060. VPXORD Z30, Z6, Z30
  1061. VPRORD $0x10, Z30, Z30
  1062. VPADDD Z22, Z30, Z22
  1063. VPXORD Z14, Z22, Z14
  1064. VPRORD $0x0c, Z14, Z14
  1065. VPADDD Z6, Z14, Z6
  1066. VPADDD Z15, Z6, Z6
  1067. VPXORD Z30, Z6, Z30
  1068. VPRORD $0x08, Z30, Z30
  1069. VPADDD Z22, Z30, Z22
  1070. VPXORD Z14, Z22, Z14
  1071. VPRORD $0x07, Z14, Z14
  1072. VPADDD Z0, Z10, Z0
  1073. VPADDD Z17, Z0, Z0
  1074. VPXORD Z30, Z0, Z30
  1075. VPRORD $0x10, Z30, Z30
  1076. VPADDD Z20, Z30, Z20
  1077. VPXORD Z10, Z20, Z10
  1078. VPRORD $0x0c, Z10, Z10
  1079. VPADDD Z0, Z10, Z0
  1080. VPADDD Z19, Z0, Z0
  1081. VPXORD Z30, Z0, Z30
  1082. VPRORD $0x08, Z30, Z30
  1083. VPADDD Z20, Z30, Z20
  1084. VPXORD Z10, Z20, Z10
  1085. VPRORD $0x07, Z10, Z10
  1086. VPADDD Z2, Z12, Z2
  1087. VPADDD Z21, Z2, Z2
  1088. VPXORD Z24, Z2, Z24
  1089. VPRORD $0x10, Z24, Z24
  1090. VPADDD Z22, Z24, Z22
  1091. VPXORD Z12, Z22, Z12
  1092. VPRORD $0x0c, Z12, Z12
  1093. VPADDD Z2, Z12, Z2
  1094. VPADDD Z23, Z2, Z2
  1095. VPXORD Z24, Z2, Z24
  1096. VPRORD $0x08, Z24, Z24
  1097. VPADDD Z22, Z24, Z22
  1098. VPXORD Z12, Z22, Z12
  1099. VPRORD $0x07, Z12, Z12
  1100. VPADDD Z4, Z14, Z4
  1101. VPADDD Z25, Z4, Z4
  1102. VPXORD Z26, Z4, Z26
  1103. VPRORD $0x10, Z26, Z26
  1104. VPADDD Z16, Z26, Z16
  1105. VPXORD Z14, Z16, Z14
  1106. VPRORD $0x0c, Z14, Z14
  1107. VPADDD Z4, Z14, Z4
  1108. VPADDD Z27, Z4, Z4
  1109. VPXORD Z26, Z4, Z26
  1110. VPRORD $0x08, Z26, Z26
  1111. VPADDD Z16, Z26, Z16
  1112. VPXORD Z14, Z16, Z14
  1113. VPRORD $0x07, Z14, Z14
  1114. VPADDD Z6, Z8, Z6
  1115. VPADDD Z29, Z6, Z6
  1116. VPXORD Z28, Z6, Z28
  1117. VPRORD $0x10, Z28, Z28
  1118. VPADDD Z18, Z28, Z18
  1119. VPXORD Z8, Z18, Z8
  1120. VPRORD $0x0c, Z8, Z8
  1121. VPADDD Z6, Z8, Z6
  1122. VPADDD Z31, Z6, Z6
  1123. VPXORD Z28, Z6, Z28
  1124. VPRORD $0x08, Z28, Z28
  1125. VPADDD Z18, Z28, Z18
  1126. VPXORD Z8, Z18, Z8
  1127. VPRORD $0x07, Z8, Z8
  1128. // Round 2
  1129. VPADDD Z0, Z8, Z0
  1130. VPADDD Z5, Z0, Z0
  1131. VPXORD Z24, Z0, Z24
  1132. VPRORD $0x10, Z24, Z24
  1133. VPADDD Z16, Z24, Z16
  1134. VPXORD Z8, Z16, Z8
  1135. VPRORD $0x0c, Z8, Z8
  1136. VPADDD Z0, Z8, Z0
  1137. VPADDD Z13, Z0, Z0
  1138. VPXORD Z24, Z0, Z24
  1139. VPRORD $0x08, Z24, Z24
  1140. VPADDD Z16, Z24, Z16
  1141. VPXORD Z8, Z16, Z8
  1142. VPRORD $0x07, Z8, Z8
  1143. VPADDD Z2, Z10, Z2
  1144. VPADDD Z7, Z2, Z2
  1145. VPXORD Z26, Z2, Z26
  1146. VPRORD $0x10, Z26, Z26
  1147. VPADDD Z18, Z26, Z18
  1148. VPXORD Z10, Z18, Z10
  1149. VPRORD $0x0c, Z10, Z10
  1150. VPADDD Z2, Z10, Z2
  1151. VPADDD Z21, Z2, Z2
  1152. VPXORD Z26, Z2, Z26
  1153. VPRORD $0x08, Z26, Z26
  1154. VPADDD Z18, Z26, Z18
  1155. VPXORD Z10, Z18, Z10
  1156. VPRORD $0x07, Z10, Z10
  1157. VPADDD Z4, Z12, Z4
  1158. VPADDD Z15, Z4, Z4
  1159. VPXORD Z28, Z4, Z28
  1160. VPRORD $0x10, Z28, Z28
  1161. VPADDD Z20, Z28, Z20
  1162. VPXORD Z12, Z20, Z12
  1163. VPRORD $0x0c, Z12, Z12
  1164. VPADDD Z4, Z12, Z4
  1165. VPADDD Z1, Z4, Z4
  1166. VPXORD Z28, Z4, Z28
  1167. VPRORD $0x08, Z28, Z28
  1168. VPADDD Z20, Z28, Z20
  1169. VPXORD Z12, Z20, Z12
  1170. VPRORD $0x07, Z12, Z12
  1171. VPADDD Z6, Z14, Z6
  1172. VPADDD Z9, Z6, Z6
  1173. VPXORD Z30, Z6, Z30
  1174. VPRORD $0x10, Z30, Z30
  1175. VPADDD Z22, Z30, Z22
  1176. VPXORD Z14, Z22, Z14
  1177. VPRORD $0x0c, Z14, Z14
  1178. VPADDD Z6, Z14, Z6
  1179. VPADDD Z27, Z6, Z6
  1180. VPXORD Z30, Z6, Z30
  1181. VPRORD $0x08, Z30, Z30
  1182. VPADDD Z22, Z30, Z22
  1183. VPXORD Z14, Z22, Z14
  1184. VPRORD $0x07, Z14, Z14
  1185. VPADDD Z0, Z10, Z0
  1186. VPADDD Z3, Z0, Z0
  1187. VPXORD Z30, Z0, Z30
  1188. VPRORD $0x10, Z30, Z30
  1189. VPADDD Z20, Z30, Z20
  1190. VPXORD Z10, Z20, Z10
  1191. VPRORD $0x0c, Z10, Z10
  1192. VPADDD Z0, Z10, Z0
  1193. VPADDD Z23, Z0, Z0
  1194. VPXORD Z30, Z0, Z30
  1195. VPRORD $0x08, Z30, Z30
  1196. VPADDD Z20, Z30, Z20
  1197. VPXORD Z10, Z20, Z10
  1198. VPRORD $0x07, Z10, Z10
  1199. VPADDD Z2, Z12, Z2
  1200. VPADDD Z25, Z2, Z2
  1201. VPXORD Z24, Z2, Z24
  1202. VPRORD $0x10, Z24, Z24
  1203. VPADDD Z22, Z24, Z22
  1204. VPXORD Z12, Z22, Z12
  1205. VPRORD $0x0c, Z12, Z12
  1206. VPADDD Z2, Z12, Z2
  1207. VPADDD Z11, Z2, Z2
  1208. VPXORD Z24, Z2, Z24
  1209. VPRORD $0x08, Z24, Z24
  1210. VPADDD Z22, Z24, Z22
  1211. VPXORD Z12, Z22, Z12
  1212. VPRORD $0x07, Z12, Z12
  1213. VPADDD Z4, Z14, Z4
  1214. VPADDD Z19, Z4, Z4
  1215. VPXORD Z26, Z4, Z26
  1216. VPRORD $0x10, Z26, Z26
  1217. VPADDD Z16, Z26, Z16
  1218. VPXORD Z14, Z16, Z14
  1219. VPRORD $0x0c, Z14, Z14
  1220. VPADDD Z4, Z14, Z4
  1221. VPADDD Z29, Z4, Z4
  1222. VPXORD Z26, Z4, Z26
  1223. VPRORD $0x08, Z26, Z26
  1224. VPADDD Z16, Z26, Z16
  1225. VPXORD Z14, Z16, Z14
  1226. VPRORD $0x07, Z14, Z14
  1227. VPADDD Z6, Z8, Z6
  1228. VPADDD Z31, Z6, Z6
  1229. VPXORD Z28, Z6, Z28
  1230. VPRORD $0x10, Z28, Z28
  1231. VPADDD Z18, Z28, Z18
  1232. VPXORD Z8, Z18, Z8
  1233. VPRORD $0x0c, Z8, Z8
  1234. VPADDD Z6, Z8, Z6
  1235. VPADDD Z17, Z6, Z6
  1236. VPXORD Z28, Z6, Z28
  1237. VPRORD $0x08, Z28, Z28
  1238. VPADDD Z18, Z28, Z18
  1239. VPXORD Z8, Z18, Z8
  1240. VPRORD $0x07, Z8, Z8
  1241. // Round 3
  1242. VPADDD Z0, Z8, Z0
  1243. VPADDD Z7, Z0, Z0
  1244. VPXORD Z24, Z0, Z24
  1245. VPRORD $0x10, Z24, Z24
  1246. VPADDD Z16, Z24, Z16
  1247. VPXORD Z8, Z16, Z8
  1248. VPRORD $0x0c, Z8, Z8
  1249. VPADDD Z0, Z8, Z0
  1250. VPADDD Z9, Z0, Z0
  1251. VPXORD Z24, Z0, Z24
  1252. VPRORD $0x08, Z24, Z24
  1253. VPADDD Z16, Z24, Z16
  1254. VPXORD Z8, Z16, Z8
  1255. VPRORD $0x07, Z8, Z8
  1256. VPADDD Z2, Z10, Z2
  1257. VPADDD Z21, Z2, Z2
  1258. VPXORD Z26, Z2, Z26
  1259. VPRORD $0x10, Z26, Z26
  1260. VPADDD Z18, Z26, Z18
  1261. VPXORD Z10, Z18, Z10
  1262. VPRORD $0x0c, Z10, Z10
  1263. VPADDD Z2, Z10, Z2
  1264. VPADDD Z25, Z2, Z2
  1265. VPXORD Z26, Z2, Z26
  1266. VPRORD $0x08, Z26, Z26
  1267. VPADDD Z18, Z26, Z18
  1268. VPXORD Z10, Z18, Z10
  1269. VPRORD $0x07, Z10, Z10
  1270. VPADDD Z4, Z12, Z4
  1271. VPADDD Z27, Z4, Z4
  1272. VPXORD Z28, Z4, Z28
  1273. VPRORD $0x10, Z28, Z28
  1274. VPADDD Z20, Z28, Z20
  1275. VPXORD Z12, Z20, Z12
  1276. VPRORD $0x0c, Z12, Z12
  1277. VPADDD Z4, Z12, Z4
  1278. VPADDD Z5, Z4, Z4
  1279. VPXORD Z28, Z4, Z28
  1280. VPRORD $0x08, Z28, Z28
  1281. VPADDD Z20, Z28, Z20
  1282. VPXORD Z12, Z20, Z12
  1283. VPRORD $0x07, Z12, Z12
  1284. VPADDD Z6, Z14, Z6
  1285. VPADDD Z15, Z6, Z6
  1286. VPXORD Z30, Z6, Z30
  1287. VPRORD $0x10, Z30, Z30
  1288. VPADDD Z22, Z30, Z22
  1289. VPXORD Z14, Z22, Z14
  1290. VPRORD $0x0c, Z14, Z14
  1291. VPADDD Z6, Z14, Z6
  1292. VPADDD Z29, Z6, Z6
  1293. VPXORD Z30, Z6, Z30
  1294. VPRORD $0x08, Z30, Z30
  1295. VPADDD Z22, Z30, Z22
  1296. VPXORD Z14, Z22, Z14
  1297. VPRORD $0x07, Z14, Z14
  1298. VPADDD Z0, Z10, Z0
  1299. VPADDD Z13, Z0, Z0
  1300. VPXORD Z30, Z0, Z30
  1301. VPRORD $0x10, Z30, Z30
  1302. VPADDD Z20, Z30, Z20
  1303. VPXORD Z10, Z20, Z10
  1304. VPRORD $0x0c, Z10, Z10
  1305. VPADDD Z0, Z10, Z0
  1306. VPADDD Z11, Z0, Z0
  1307. VPXORD Z30, Z0, Z30
  1308. VPRORD $0x08, Z30, Z30
  1309. VPADDD Z20, Z30, Z20
  1310. VPXORD Z10, Z20, Z10
  1311. VPRORD $0x07, Z10, Z10
  1312. VPADDD Z2, Z12, Z2
  1313. VPADDD Z19, Z2, Z2
  1314. VPXORD Z24, Z2, Z24
  1315. VPRORD $0x10, Z24, Z24
  1316. VPADDD Z22, Z24, Z22
  1317. VPXORD Z12, Z22, Z12
  1318. VPRORD $0x0c, Z12, Z12
  1319. VPADDD Z2, Z12, Z2
  1320. VPADDD Z1, Z2, Z2
  1321. VPXORD Z24, Z2, Z24
  1322. VPRORD $0x08, Z24, Z24
  1323. VPADDD Z22, Z24, Z22
  1324. VPXORD Z12, Z22, Z12
  1325. VPRORD $0x07, Z12, Z12
  1326. VPADDD Z4, Z14, Z4
  1327. VPADDD Z23, Z4, Z4
  1328. VPXORD Z26, Z4, Z26
  1329. VPRORD $0x10, Z26, Z26
  1330. VPADDD Z16, Z26, Z16
  1331. VPXORD Z14, Z16, Z14
  1332. VPRORD $0x0c, Z14, Z14
  1333. VPADDD Z4, Z14, Z4
  1334. VPADDD Z31, Z4, Z4
  1335. VPXORD Z26, Z4, Z26
  1336. VPRORD $0x08, Z26, Z26
  1337. VPADDD Z16, Z26, Z16
  1338. VPXORD Z14, Z16, Z14
  1339. VPRORD $0x07, Z14, Z14
  1340. VPADDD Z6, Z8, Z6
  1341. VPADDD Z17, Z6, Z6
  1342. VPXORD Z28, Z6, Z28
  1343. VPRORD $0x10, Z28, Z28
  1344. VPADDD Z18, Z28, Z18
  1345. VPXORD Z8, Z18, Z8
  1346. VPRORD $0x0c, Z8, Z8
  1347. VPADDD Z6, Z8, Z6
  1348. VPADDD Z3, Z6, Z6
  1349. VPXORD Z28, Z6, Z28
  1350. VPRORD $0x08, Z28, Z28
  1351. VPADDD Z18, Z28, Z18
  1352. VPXORD Z8, Z18, Z8
  1353. VPRORD $0x07, Z8, Z8
  1354. // Round 4
  1355. VPADDD Z0, Z8, Z0
  1356. VPADDD Z21, Z0, Z0
  1357. VPXORD Z24, Z0, Z24
  1358. VPRORD $0x10, Z24, Z24
  1359. VPADDD Z16, Z24, Z16
  1360. VPXORD Z8, Z16, Z8
  1361. VPRORD $0x0c, Z8, Z8
  1362. VPADDD Z0, Z8, Z0
  1363. VPADDD Z15, Z0, Z0
  1364. VPXORD Z24, Z0, Z24
  1365. VPRORD $0x08, Z24, Z24
  1366. VPADDD Z16, Z24, Z16
  1367. VPXORD Z8, Z16, Z8
  1368. VPRORD $0x07, Z8, Z8
  1369. VPADDD Z2, Z10, Z2
  1370. VPADDD Z25, Z2, Z2
  1371. VPXORD Z26, Z2, Z26
  1372. VPRORD $0x10, Z26, Z26
  1373. VPADDD Z18, Z26, Z18
  1374. VPXORD Z10, Z18, Z10
  1375. VPRORD $0x0c, Z10, Z10
  1376. VPADDD Z2, Z10, Z2
  1377. VPADDD Z19, Z2, Z2
  1378. VPXORD Z26, Z2, Z26
  1379. VPRORD $0x08, Z26, Z26
  1380. VPADDD Z18, Z26, Z18
  1381. VPXORD Z10, Z18, Z10
  1382. VPRORD $0x07, Z10, Z10
  1383. VPADDD Z4, Z12, Z4
  1384. VPADDD Z29, Z4, Z4
  1385. VPXORD Z28, Z4, Z28
  1386. VPRORD $0x10, Z28, Z28
  1387. VPADDD Z20, Z28, Z20
  1388. VPXORD Z12, Z20, Z12
  1389. VPRORD $0x0c, Z12, Z12
  1390. VPADDD Z4, Z12, Z4
  1391. VPADDD Z7, Z4, Z4
  1392. VPXORD Z28, Z4, Z28
  1393. VPRORD $0x08, Z28, Z28
  1394. VPADDD Z20, Z28, Z20
  1395. VPXORD Z12, Z20, Z12
  1396. VPRORD $0x07, Z12, Z12
  1397. VPADDD Z6, Z14, Z6
  1398. VPADDD Z27, Z6, Z6
  1399. VPXORD Z30, Z6, Z30
  1400. VPRORD $0x10, Z30, Z30
  1401. VPADDD Z22, Z30, Z22
  1402. VPXORD Z14, Z22, Z14
  1403. VPRORD $0x0c, Z14, Z14
  1404. VPADDD Z6, Z14, Z6
  1405. VPADDD Z31, Z6, Z6
  1406. VPXORD Z30, Z6, Z30
  1407. VPRORD $0x08, Z30, Z30
  1408. VPADDD Z22, Z30, Z22
  1409. VPXORD Z14, Z22, Z14
  1410. VPRORD $0x07, Z14, Z14
  1411. VPADDD Z0, Z10, Z0
  1412. VPADDD Z9, Z0, Z0
  1413. VPXORD Z30, Z0, Z30
  1414. VPRORD $0x10, Z30, Z30
  1415. VPADDD Z20, Z30, Z20
  1416. VPXORD Z10, Z20, Z10
  1417. VPRORD $0x0c, Z10, Z10
  1418. VPADDD Z0, Z10, Z0
  1419. VPADDD Z1, Z0, Z0
  1420. VPXORD Z30, Z0, Z30
  1421. VPRORD $0x08, Z30, Z30
  1422. VPADDD Z20, Z30, Z20
  1423. VPXORD Z10, Z20, Z10
  1424. VPRORD $0x07, Z10, Z10
  1425. VPADDD Z2, Z12, Z2
  1426. VPADDD Z23, Z2, Z2
  1427. VPXORD Z24, Z2, Z24
  1428. VPRORD $0x10, Z24, Z24
  1429. VPADDD Z22, Z24, Z22
  1430. VPXORD Z12, Z22, Z12
  1431. VPRORD $0x0c, Z12, Z12
  1432. VPADDD Z2, Z12, Z2
  1433. VPADDD Z5, Z2, Z2
  1434. VPXORD Z24, Z2, Z24
  1435. VPRORD $0x08, Z24, Z24
  1436. VPADDD Z22, Z24, Z22
  1437. VPXORD Z12, Z22, Z12
  1438. VPRORD $0x07, Z12, Z12
  1439. VPADDD Z4, Z14, Z4
  1440. VPADDD Z11, Z4, Z4
  1441. VPXORD Z26, Z4, Z26
  1442. VPRORD $0x10, Z26, Z26
  1443. VPADDD Z16, Z26, Z16
  1444. VPXORD Z14, Z16, Z14
  1445. VPRORD $0x0c, Z14, Z14
  1446. VPADDD Z4, Z14, Z4
  1447. VPADDD Z17, Z4, Z4
  1448. VPXORD Z26, Z4, Z26
  1449. VPRORD $0x08, Z26, Z26
  1450. VPADDD Z16, Z26, Z16
  1451. VPXORD Z14, Z16, Z14
  1452. VPRORD $0x07, Z14, Z14
  1453. VPADDD Z6, Z8, Z6
  1454. VPADDD Z3, Z6, Z6
  1455. VPXORD Z28, Z6, Z28
  1456. VPRORD $0x10, Z28, Z28
  1457. VPADDD Z18, Z28, Z18
  1458. VPXORD Z8, Z18, Z8
  1459. VPRORD $0x0c, Z8, Z8
  1460. VPADDD Z6, Z8, Z6
  1461. VPADDD Z13, Z6, Z6
  1462. VPXORD Z28, Z6, Z28
  1463. VPRORD $0x08, Z28, Z28
  1464. VPADDD Z18, Z28, Z18
  1465. VPXORD Z8, Z18, Z8
  1466. VPRORD $0x07, Z8, Z8
  1467. // Round 5
  1468. VPADDD Z0, Z8, Z0
  1469. VPADDD Z25, Z0, Z0
  1470. VPXORD Z24, Z0, Z24
  1471. VPRORD $0x10, Z24, Z24
  1472. VPADDD Z16, Z24, Z16
  1473. VPXORD Z8, Z16, Z8
  1474. VPRORD $0x0c, Z8, Z8
  1475. VPADDD Z0, Z8, Z0
  1476. VPADDD Z27, Z0, Z0
  1477. VPXORD Z24, Z0, Z24
  1478. VPRORD $0x08, Z24, Z24
  1479. VPADDD Z16, Z24, Z16
  1480. VPXORD Z8, Z16, Z8
  1481. VPRORD $0x07, Z8, Z8
  1482. VPADDD Z2, Z10, Z2
  1483. VPADDD Z19, Z2, Z2
  1484. VPXORD Z26, Z2, Z26
  1485. VPRORD $0x10, Z26, Z26
  1486. VPADDD Z18, Z26, Z18
  1487. VPXORD Z10, Z18, Z10
  1488. VPRORD $0x0c, Z10, Z10
  1489. VPADDD Z2, Z10, Z2
  1490. VPADDD Z23, Z2, Z2
  1491. VPXORD Z26, Z2, Z26
  1492. VPRORD $0x08, Z26, Z26
  1493. VPADDD Z18, Z26, Z18
  1494. VPXORD Z10, Z18, Z10
  1495. VPRORD $0x07, Z10, Z10
  1496. VPADDD Z4, Z12, Z4
  1497. VPADDD Z31, Z4, Z4
  1498. VPXORD Z28, Z4, Z28
  1499. VPRORD $0x10, Z28, Z28
  1500. VPADDD Z20, Z28, Z20
  1501. VPXORD Z12, Z20, Z12
  1502. VPRORD $0x0c, Z12, Z12
  1503. VPADDD Z4, Z12, Z4
  1504. VPADDD Z21, Z4, Z4
  1505. VPXORD Z28, Z4, Z28
  1506. VPRORD $0x08, Z28, Z28
  1507. VPADDD Z20, Z28, Z20
  1508. VPXORD Z12, Z20, Z12
  1509. VPRORD $0x07, Z12, Z12
  1510. VPADDD Z6, Z14, Z6
  1511. VPADDD Z29, Z6, Z6
  1512. VPXORD Z30, Z6, Z30
  1513. VPRORD $0x10, Z30, Z30
  1514. VPADDD Z22, Z30, Z22
  1515. VPXORD Z14, Z22, Z14
  1516. VPRORD $0x0c, Z14, Z14
  1517. VPADDD Z6, Z14, Z6
  1518. VPADDD Z17, Z6, Z6
  1519. VPXORD Z30, Z6, Z30
  1520. VPRORD $0x08, Z30, Z30
  1521. VPADDD Z22, Z30, Z22
  1522. VPXORD Z14, Z22, Z14
  1523. VPRORD $0x07, Z14, Z14
  1524. VPADDD Z0, Z10, Z0
  1525. VPADDD Z15, Z0, Z0
  1526. VPXORD Z30, Z0, Z30
  1527. VPRORD $0x10, Z30, Z30
  1528. VPADDD Z20, Z30, Z20
  1529. VPXORD Z10, Z20, Z10
  1530. VPRORD $0x0c, Z10, Z10
  1531. VPADDD Z0, Z10, Z0
  1532. VPADDD Z5, Z0, Z0
  1533. VPXORD Z30, Z0, Z30
  1534. VPRORD $0x08, Z30, Z30
  1535. VPADDD Z20, Z30, Z20
  1536. VPXORD Z10, Z20, Z10
  1537. VPRORD $0x07, Z10, Z10
  1538. VPADDD Z2, Z12, Z2
  1539. VPADDD Z11, Z2, Z2
  1540. VPXORD Z24, Z2, Z24
  1541. VPRORD $0x10, Z24, Z24
  1542. VPADDD Z22, Z24, Z22
  1543. VPXORD Z12, Z22, Z12
  1544. VPRORD $0x0c, Z12, Z12
  1545. VPADDD Z2, Z12, Z2
  1546. VPADDD Z7, Z2, Z2
  1547. VPXORD Z24, Z2, Z24
  1548. VPRORD $0x08, Z24, Z24
  1549. VPADDD Z22, Z24, Z22
  1550. VPXORD Z12, Z22, Z12
  1551. VPRORD $0x07, Z12, Z12
  1552. VPADDD Z4, Z14, Z4
  1553. VPADDD Z1, Z4, Z4
  1554. VPXORD Z26, Z4, Z26
  1555. VPRORD $0x10, Z26, Z26
  1556. VPADDD Z16, Z26, Z16
  1557. VPXORD Z14, Z16, Z14
  1558. VPRORD $0x0c, Z14, Z14
  1559. VPADDD Z4, Z14, Z4
  1560. VPADDD Z3, Z4, Z4
  1561. VPXORD Z26, Z4, Z26
  1562. VPRORD $0x08, Z26, Z26
  1563. VPADDD Z16, Z26, Z16
  1564. VPXORD Z14, Z16, Z14
  1565. VPRORD $0x07, Z14, Z14
  1566. VPADDD Z6, Z8, Z6
  1567. VPADDD Z13, Z6, Z6
  1568. VPXORD Z28, Z6, Z28
  1569. VPRORD $0x10, Z28, Z28
  1570. VPADDD Z18, Z28, Z18
  1571. VPXORD Z8, Z18, Z8
  1572. VPRORD $0x0c, Z8, Z8
  1573. VPADDD Z6, Z8, Z6
  1574. VPADDD Z9, Z6, Z6
  1575. VPXORD Z28, Z6, Z28
  1576. VPRORD $0x08, Z28, Z28
  1577. VPADDD Z18, Z28, Z18
  1578. VPXORD Z8, Z18, Z8
  1579. VPRORD $0x07, Z8, Z8
  1580. // Round 6
  1581. VPADDD Z0, Z8, Z0
  1582. VPADDD Z19, Z0, Z0
  1583. VPXORD Z24, Z0, Z24
  1584. VPRORD $0x10, Z24, Z24
  1585. VPADDD Z16, Z24, Z16
  1586. VPXORD Z8, Z16, Z8
  1587. VPRORD $0x0c, Z8, Z8
  1588. VPADDD Z0, Z8, Z0
  1589. VPADDD Z29, Z0, Z0
  1590. VPXORD Z24, Z0, Z24
  1591. VPRORD $0x08, Z24, Z24
  1592. VPADDD Z16, Z24, Z16
  1593. VPXORD Z8, Z16, Z8
  1594. VPRORD $0x07, Z8, Z8
  1595. VPADDD Z2, Z10, Z2
  1596. VPADDD Z23, Z2, Z2
  1597. VPXORD Z26, Z2, Z26
  1598. VPRORD $0x10, Z26, Z26
  1599. VPADDD Z18, Z26, Z18
  1600. VPXORD Z10, Z18, Z10
  1601. VPRORD $0x0c, Z10, Z10
  1602. VPADDD Z2, Z10, Z2
  1603. VPADDD Z11, Z2, Z2
  1604. VPXORD Z26, Z2, Z26
  1605. VPRORD $0x08, Z26, Z26
  1606. VPADDD Z18, Z26, Z18
  1607. VPXORD Z10, Z18, Z10
  1608. VPRORD $0x07, Z10, Z10
  1609. VPADDD Z4, Z12, Z4
  1610. VPADDD Z17, Z4, Z4
  1611. VPXORD Z28, Z4, Z28
  1612. VPRORD $0x10, Z28, Z28
  1613. VPADDD Z20, Z28, Z20
  1614. VPXORD Z12, Z20, Z12
  1615. VPRORD $0x0c, Z12, Z12
  1616. VPADDD Z4, Z12, Z4
  1617. VPADDD Z25, Z4, Z4
  1618. VPXORD Z28, Z4, Z28
  1619. VPRORD $0x08, Z28, Z28
  1620. VPADDD Z20, Z28, Z20
  1621. VPXORD Z12, Z20, Z12
  1622. VPRORD $0x07, Z12, Z12
  1623. VPADDD Z6, Z14, Z6
  1624. VPADDD Z31, Z6, Z6
  1625. VPXORD Z30, Z6, Z30
  1626. VPRORD $0x10, Z30, Z30
  1627. VPADDD Z22, Z30, Z22
  1628. VPXORD Z14, Z22, Z14
  1629. VPRORD $0x0c, Z14, Z14
  1630. VPADDD Z6, Z14, Z6
  1631. VPADDD Z3, Z6, Z6
  1632. VPXORD Z30, Z6, Z30
  1633. VPRORD $0x08, Z30, Z30
  1634. VPADDD Z22, Z30, Z22
  1635. VPXORD Z14, Z22, Z14
  1636. VPRORD $0x07, Z14, Z14
  1637. VPADDD Z0, Z10, Z0
  1638. VPADDD Z27, Z0, Z0
  1639. VPXORD Z30, Z0, Z30
  1640. VPRORD $0x10, Z30, Z30
  1641. VPADDD Z20, Z30, Z20
  1642. VPXORD Z10, Z20, Z10
  1643. VPRORD $0x0c, Z10, Z10
  1644. VPADDD Z0, Z10, Z0
  1645. VPADDD Z7, Z0, Z0
  1646. VPXORD Z30, Z0, Z30
  1647. VPRORD $0x08, Z30, Z30
  1648. VPADDD Z20, Z30, Z20
  1649. VPXORD Z10, Z20, Z10
  1650. VPRORD $0x07, Z10, Z10
  1651. VPADDD Z2, Z12, Z2
  1652. VPADDD Z1, Z2, Z2
  1653. VPXORD Z24, Z2, Z24
  1654. VPRORD $0x10, Z24, Z24
  1655. VPADDD Z22, Z24, Z22
  1656. VPXORD Z12, Z22, Z12
  1657. VPRORD $0x0c, Z12, Z12
  1658. VPADDD Z2, Z12, Z2
  1659. VPADDD Z21, Z2, Z2
  1660. VPXORD Z24, Z2, Z24
  1661. VPRORD $0x08, Z24, Z24
  1662. VPADDD Z22, Z24, Z22
  1663. VPXORD Z12, Z22, Z12
  1664. VPRORD $0x07, Z12, Z12
  1665. VPADDD Z4, Z14, Z4
  1666. VPADDD Z5, Z4, Z4
  1667. VPXORD Z26, Z4, Z26
  1668. VPRORD $0x10, Z26, Z26
  1669. VPADDD Z16, Z26, Z16
  1670. VPXORD Z14, Z16, Z14
  1671. VPRORD $0x0c, Z14, Z14
  1672. VPADDD Z4, Z14, Z4
  1673. VPADDD Z13, Z4, Z4
  1674. VPXORD Z26, Z4, Z26
  1675. VPRORD $0x08, Z26, Z26
  1676. VPADDD Z16, Z26, Z16
  1677. VPXORD Z14, Z16, Z14
  1678. VPRORD $0x07, Z14, Z14
  1679. VPADDD Z6, Z8, Z6
  1680. VPADDD Z9, Z6, Z6
  1681. VPXORD Z28, Z6, Z28
  1682. VPRORD $0x10, Z28, Z28
  1683. VPADDD Z18, Z28, Z18
  1684. VPXORD Z8, Z18, Z8
  1685. VPRORD $0x0c, Z8, Z8
  1686. VPADDD Z6, Z8, Z6
  1687. VPADDD Z15, Z6, Z6
  1688. VPXORD Z28, Z6, Z28
  1689. VPRORD $0x08, Z28, Z28
  1690. VPADDD Z18, Z28, Z18
  1691. VPXORD Z8, Z18, Z8
  1692. VPRORD $0x07, Z8, Z8
  1693. // Round 7
  1694. VPADDD Z0, Z8, Z0
  1695. VPADDD Z23, Z0, Z0
  1696. VPXORD Z24, Z0, Z24
  1697. VPRORD $0x10, Z24, Z24
  1698. VPADDD Z16, Z24, Z16
  1699. VPXORD Z8, Z16, Z8
  1700. VPRORD $0x0c, Z8, Z8
  1701. VPADDD Z0, Z8, Z0
  1702. VPADDD Z31, Z0, Z0
  1703. VPXORD Z24, Z0, Z24
  1704. VPRORD $0x08, Z24, Z24
  1705. VPADDD Z16, Z24, Z16
  1706. VPXORD Z8, Z16, Z8
  1707. VPRORD $0x07, Z8, Z8
  1708. VPADDD Z2, Z10, Z2
  1709. VPADDD Z11, Z2, Z2
  1710. VPXORD Z26, Z2, Z26
  1711. VPRORD $0x10, Z26, Z26
  1712. VPADDD Z18, Z26, Z18
  1713. VPXORD Z10, Z18, Z10
  1714. VPRORD $0x0c, Z10, Z10
  1715. VPADDD Z2, Z10, Z2
  1716. VPADDD Z1, Z2, Z2
  1717. VPXORD Z26, Z2, Z26
  1718. VPRORD $0x08, Z26, Z26
  1719. VPADDD Z18, Z26, Z18
  1720. VPXORD Z10, Z18, Z10
  1721. VPRORD $0x07, Z10, Z10
  1722. VPADDD Z4, Z12, Z4
  1723. VPADDD Z3, Z4, Z4
  1724. VPXORD Z28, Z4, Z28
  1725. VPRORD $0x10, Z28, Z28
  1726. VPADDD Z20, Z28, Z20
  1727. VPXORD Z12, Z20, Z12
  1728. VPRORD $0x0c, Z12, Z12
  1729. VPADDD Z4, Z12, Z4
  1730. VPADDD Z19, Z4, Z4
  1731. VPXORD Z28, Z4, Z28
  1732. VPRORD $0x08, Z28, Z28
  1733. VPADDD Z20, Z28, Z20
  1734. VPXORD Z12, Z20, Z12
  1735. VPRORD $0x07, Z12, Z12
  1736. VPADDD Z6, Z14, Z6
  1737. VPADDD Z17, Z6, Z6
  1738. VPXORD Z30, Z6, Z30
  1739. VPRORD $0x10, Z30, Z30
  1740. VPADDD Z22, Z30, Z22
  1741. VPXORD Z14, Z22, Z14
  1742. VPRORD $0x0c, Z14, Z14
  1743. VPADDD Z6, Z14, Z6
  1744. VPADDD Z13, Z6, Z6
  1745. VPXORD Z30, Z6, Z30
  1746. VPRORD $0x08, Z30, Z30
  1747. VPADDD Z22, Z30, Z22
  1748. VPXORD Z14, Z22, Z14
  1749. VPRORD $0x07, Z14, Z14
  1750. VPADDD Z0, Z10, Z0
  1751. VPADDD Z29, Z0, Z0
  1752. VPXORD Z30, Z0, Z30
  1753. VPRORD $0x10, Z30, Z30
  1754. VPADDD Z20, Z30, Z20
  1755. VPXORD Z10, Z20, Z10
  1756. VPRORD $0x0c, Z10, Z10
  1757. VPADDD Z0, Z10, Z0
  1758. VPADDD Z21, Z0, Z0
  1759. VPXORD Z30, Z0, Z30
  1760. VPRORD $0x08, Z30, Z30
  1761. VPADDD Z20, Z30, Z20
  1762. VPXORD Z10, Z20, Z10
  1763. VPRORD $0x07, Z10, Z10
  1764. VPADDD Z2, Z12, Z2
  1765. VPADDD Z5, Z2, Z2
  1766. VPXORD Z24, Z2, Z24
  1767. VPRORD $0x10, Z24, Z24
  1768. VPADDD Z22, Z24, Z22
  1769. VPXORD Z12, Z22, Z12
  1770. VPRORD $0x0c, Z12, Z12
  1771. VPADDD Z2, Z12, Z2
  1772. VPADDD Z25, Z2, Z2
  1773. VPXORD Z24, Z2, Z24
  1774. VPRORD $0x08, Z24, Z24
  1775. VPADDD Z22, Z24, Z22
  1776. VPXORD Z12, Z22, Z12
  1777. VPRORD $0x07, Z12, Z12
  1778. VPADDD Z4, Z14, Z4
  1779. VPADDD Z7, Z4, Z4
  1780. VPXORD Z26, Z4, Z26
  1781. VPRORD $0x10, Z26, Z26
  1782. VPADDD Z16, Z26, Z16
  1783. VPXORD Z14, Z16, Z14
  1784. VPRORD $0x0c, Z14, Z14
  1785. VPADDD Z4, Z14, Z4
  1786. VPADDD Z9, Z4, Z4
  1787. VPXORD Z26, Z4, Z26
  1788. VPRORD $0x08, Z26, Z26
  1789. VPADDD Z16, Z26, Z16
  1790. VPXORD Z14, Z16, Z14
  1791. VPRORD $0x07, Z14, Z14
  1792. VPADDD Z6, Z8, Z6
  1793. VPADDD Z15, Z6, Z6
  1794. VPXORD Z28, Z6, Z28
  1795. VPRORD $0x10, Z28, Z28
  1796. VPADDD Z18, Z28, Z18
  1797. VPXORD Z8, Z18, Z8
  1798. VPRORD $0x0c, Z8, Z8
  1799. VPADDD Z6, Z8, Z6
  1800. VPADDD Z27, Z6, Z6
  1801. VPXORD Z28, Z6, Z28
  1802. VPRORD $0x08, Z28, Z28
  1803. VPADDD Z18, Z28, Z18
  1804. VPXORD Z8, Z18, Z8
  1805. VPRORD $0x07, Z8, Z8
  1806. // Finalize CVs
  1807. VPXORD Z0, Z16, Z0
  1808. VPXORD Z2, Z18, Z2
  1809. VPXORD Z4, Z20, Z4
  1810. VPXORD Z6, Z22, Z6
  1811. VPXORD Z8, Z24, Z8
  1812. VPXORD Z10, Z26, Z10
  1813. VPXORD Z12, Z28, Z12
  1814. VPXORD Z14, Z30, Z14
  1815. // Loop
  1816. INCQ DX
  1817. CMPQ DX, $0x00000010
  1818. JNE loop
  1819. // Finished; transpose CVs
  1820. VMOVDQU32 seq<>+0(SB), Z16
  1821. VPSLLD $0x05, Z16, Z16
  1822. KXNORD K1, K1, K1
  1823. VPSCATTERDD Z0, K1, (AX)(Z16*1)
  1824. KXNORD K1, K1, K1
  1825. VPSCATTERDD Z2, K1, 4(AX)(Z16*1)
  1826. KXNORD K1, K1, K1
  1827. VPSCATTERDD Z4, K1, 8(AX)(Z16*1)
  1828. KXNORD K1, K1, K1
  1829. VPSCATTERDD Z6, K1, 12(AX)(Z16*1)
  1830. KXNORD K1, K1, K1
  1831. VPSCATTERDD Z8, K1, 16(AX)(Z16*1)
  1832. KXNORD K1, K1, K1
  1833. VPSCATTERDD Z10, K1, 20(AX)(Z16*1)
  1834. KXNORD K1, K1, K1
  1835. VPSCATTERDD Z12, K1, 24(AX)(Z16*1)
  1836. KXNORD K1, K1, K1
  1837. VPSCATTERDD Z14, K1, 28(AX)(Z16*1)
  1838. RET
  1839. // func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
  1840. // Requires: AVX, AVX2
  1841. TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
  1842. MOVQ out+0(FP), AX
  1843. MOVQ block+8(FP), CX
  1844. MOVQ cv+16(FP), DX
  1845. // Load block
  1846. VPBROADCASTD (CX), Y0
  1847. VMOVDQU Y0, (SP)
  1848. VPBROADCASTD 4(CX), Y0
  1849. VMOVDQU Y0, 32(SP)
  1850. VPBROADCASTD 8(CX), Y0
  1851. VMOVDQU Y0, 64(SP)
  1852. VPBROADCASTD 12(CX), Y0
  1853. VMOVDQU Y0, 96(SP)
  1854. VPBROADCASTD 16(CX), Y0
  1855. VMOVDQU Y0, 128(SP)
  1856. VPBROADCASTD 20(CX), Y0
  1857. VMOVDQU Y0, 160(SP)
  1858. VPBROADCASTD 24(CX), Y0
  1859. VMOVDQU Y0, 192(SP)
  1860. VPBROADCASTD 28(CX), Y0
  1861. VMOVDQU Y0, 224(SP)
  1862. VPBROADCASTD 32(CX), Y0
  1863. VMOVDQU Y0, 256(SP)
  1864. VPBROADCASTD 36(CX), Y0
  1865. VMOVDQU Y0, 288(SP)
  1866. VPBROADCASTD 40(CX), Y0
  1867. VMOVDQU Y0, 320(SP)
  1868. VPBROADCASTD 44(CX), Y0
  1869. VMOVDQU Y0, 352(SP)
  1870. VPBROADCASTD 48(CX), Y0
  1871. VMOVDQU Y0, 384(SP)
  1872. VPBROADCASTD 52(CX), Y0
  1873. VMOVDQU Y0, 416(SP)
  1874. VPBROADCASTD 56(CX), Y0
  1875. VMOVDQU Y0, 448(SP)
  1876. VPBROADCASTD 60(CX), Y0
  1877. VMOVDQU Y0, 480(SP)
  1878. // Initialize state vectors
  1879. VPBROADCASTD (DX), Y0
  1880. VPBROADCASTD 4(DX), Y1
  1881. VPBROADCASTD 8(DX), Y2
  1882. VPBROADCASTD 12(DX), Y3
  1883. VPBROADCASTD 16(DX), Y4
  1884. VPBROADCASTD 20(DX), Y5
  1885. VPBROADCASTD 24(DX), Y6
  1886. VPBROADCASTD 28(DX), Y7
  1887. VPBROADCASTD iv<>+0(SB), Y8
  1888. VPBROADCASTD iv<>+4(SB), Y9
  1889. VPBROADCASTD iv<>+8(SB), Y10
  1890. VPBROADCASTD iv<>+12(SB), Y11
  1891. VPBROADCASTQ counter+24(FP), Y12
  1892. VPBROADCASTQ counter+24(FP), Y13
  1893. VPADDQ seq64<>+0(SB), Y12, Y12
  1894. VPADDQ seq64<>+32(SB), Y13, Y13
  1895. VPUNPCKLDQ Y13, Y12, Y14
  1896. VPUNPCKHDQ Y13, Y12, Y15
  1897. VPUNPCKLDQ Y15, Y14, Y12
  1898. VPUNPCKHDQ Y15, Y14, Y13
  1899. VPERMQ $0xd8, Y12, Y12
  1900. VPERMQ $0xd8, Y13, Y13
  1901. VPBROADCASTD blockLen+32(FP), Y14
  1902. VPBROADCASTD flags+36(FP), Y15
  1903. VMOVDQU Y8, 512(SP)
  1904. // Round 1
  1905. VPADDD Y0, Y4, Y0
  1906. VPADDD (SP), Y0, Y0
  1907. VPXOR Y12, Y0, Y12
  1908. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  1909. VMOVDQU 512(SP), Y8
  1910. VPADDD Y8, Y12, Y8
  1911. VPXOR Y4, Y8, Y4
  1912. VMOVDQU Y8, 512(SP)
  1913. VPSRLD $0x0c, Y4, Y8
  1914. VPSLLD $0x14, Y4, Y4
  1915. VPOR Y4, Y8, Y4
  1916. VPADDD Y0, Y4, Y0
  1917. VPADDD 32(SP), Y0, Y0
  1918. VPXOR Y12, Y0, Y12
  1919. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  1920. VMOVDQU 512(SP), Y8
  1921. VPADDD Y8, Y12, Y8
  1922. VPXOR Y4, Y8, Y4
  1923. VMOVDQU Y8, 512(SP)
  1924. VPSRLD $0x07, Y4, Y8
  1925. VPSLLD $0x19, Y4, Y4
  1926. VPOR Y4, Y8, Y4
  1927. VPADDD Y1, Y5, Y1
  1928. VPADDD 64(SP), Y1, Y1
  1929. VPXOR Y13, Y1, Y13
  1930. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  1931. VPADDD Y9, Y13, Y9
  1932. VPXOR Y5, Y9, Y5
  1933. VPSRLD $0x0c, Y5, Y8
  1934. VPSLLD $0x14, Y5, Y5
  1935. VPOR Y5, Y8, Y5
  1936. VPADDD Y1, Y5, Y1
  1937. VPADDD 96(SP), Y1, Y1
  1938. VPXOR Y13, Y1, Y13
  1939. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  1940. VPADDD Y9, Y13, Y9
  1941. VPXOR Y5, Y9, Y5
  1942. VPSRLD $0x07, Y5, Y8
  1943. VPSLLD $0x19, Y5, Y5
  1944. VPOR Y5, Y8, Y5
  1945. VPADDD Y2, Y6, Y2
  1946. VPADDD 128(SP), Y2, Y2
  1947. VPXOR Y14, Y2, Y14
  1948. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  1949. VPADDD Y10, Y14, Y10
  1950. VPXOR Y6, Y10, Y6
  1951. VPSRLD $0x0c, Y6, Y8
  1952. VPSLLD $0x14, Y6, Y6
  1953. VPOR Y6, Y8, Y6
  1954. VPADDD Y2, Y6, Y2
  1955. VPADDD 160(SP), Y2, Y2
  1956. VPXOR Y14, Y2, Y14
  1957. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  1958. VPADDD Y10, Y14, Y10
  1959. VPXOR Y6, Y10, Y6
  1960. VPSRLD $0x07, Y6, Y8
  1961. VPSLLD $0x19, Y6, Y6
  1962. VPOR Y6, Y8, Y6
  1963. VPADDD Y3, Y7, Y3
  1964. VPADDD 192(SP), Y3, Y3
  1965. VPXOR Y15, Y3, Y15
  1966. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  1967. VPADDD Y11, Y15, Y11
  1968. VPXOR Y7, Y11, Y7
  1969. VPSRLD $0x0c, Y7, Y8
  1970. VPSLLD $0x14, Y7, Y7
  1971. VPOR Y7, Y8, Y7
  1972. VPADDD Y3, Y7, Y3
  1973. VPADDD 224(SP), Y3, Y3
  1974. VPXOR Y15, Y3, Y15
  1975. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  1976. VPADDD Y11, Y15, Y11
  1977. VPXOR Y7, Y11, Y7
  1978. VPSRLD $0x07, Y7, Y8
  1979. VPSLLD $0x19, Y7, Y7
  1980. VPOR Y7, Y8, Y7
  1981. VPADDD Y0, Y5, Y0
  1982. VPADDD 256(SP), Y0, Y0
  1983. VPXOR Y15, Y0, Y15
  1984. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  1985. VPADDD Y10, Y15, Y10
  1986. VPXOR Y5, Y10, Y5
  1987. VPSRLD $0x0c, Y5, Y8
  1988. VPSLLD $0x14, Y5, Y5
  1989. VPOR Y5, Y8, Y5
  1990. VPADDD Y0, Y5, Y0
  1991. VPADDD 288(SP), Y0, Y0
  1992. VPXOR Y15, Y0, Y15
  1993. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  1994. VPADDD Y10, Y15, Y10
  1995. VPXOR Y5, Y10, Y5
  1996. VPSRLD $0x07, Y5, Y8
  1997. VPSLLD $0x19, Y5, Y5
  1998. VPOR Y5, Y8, Y5
  1999. VPADDD Y1, Y6, Y1
  2000. VPADDD 320(SP), Y1, Y1
  2001. VPXOR Y12, Y1, Y12
  2002. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2003. VPADDD Y11, Y12, Y11
  2004. VPXOR Y6, Y11, Y6
  2005. VPSRLD $0x0c, Y6, Y8
  2006. VPSLLD $0x14, Y6, Y6
  2007. VPOR Y6, Y8, Y6
  2008. VPADDD Y1, Y6, Y1
  2009. VPADDD 352(SP), Y1, Y1
  2010. VPXOR Y12, Y1, Y12
  2011. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2012. VPADDD Y11, Y12, Y11
  2013. VPXOR Y6, Y11, Y6
  2014. VPSRLD $0x07, Y6, Y8
  2015. VPSLLD $0x19, Y6, Y6
  2016. VPOR Y6, Y8, Y6
  2017. VPADDD Y2, Y7, Y2
  2018. VPADDD 384(SP), Y2, Y2
  2019. VPXOR Y13, Y2, Y13
  2020. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2021. VMOVDQU 512(SP), Y8
  2022. VPADDD Y8, Y13, Y8
  2023. VPXOR Y7, Y8, Y7
  2024. VMOVDQU Y8, 512(SP)
  2025. VPSRLD $0x0c, Y7, Y8
  2026. VPSLLD $0x14, Y7, Y7
  2027. VPOR Y7, Y8, Y7
  2028. VPADDD Y2, Y7, Y2
  2029. VPADDD 416(SP), Y2, Y2
  2030. VPXOR Y13, Y2, Y13
  2031. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2032. VMOVDQU 512(SP), Y8
  2033. VPADDD Y8, Y13, Y8
  2034. VPXOR Y7, Y8, Y7
  2035. VMOVDQU Y8, 512(SP)
  2036. VPSRLD $0x07, Y7, Y8
  2037. VPSLLD $0x19, Y7, Y7
  2038. VPOR Y7, Y8, Y7
  2039. VPADDD Y3, Y4, Y3
  2040. VPADDD 448(SP), Y3, Y3
  2041. VPXOR Y14, Y3, Y14
  2042. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2043. VPADDD Y9, Y14, Y9
  2044. VPXOR Y4, Y9, Y4
  2045. VPSRLD $0x0c, Y4, Y8
  2046. VPSLLD $0x14, Y4, Y4
  2047. VPOR Y4, Y8, Y4
  2048. VPADDD Y3, Y4, Y3
  2049. VPADDD 480(SP), Y3, Y3
  2050. VPXOR Y14, Y3, Y14
  2051. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2052. VPADDD Y9, Y14, Y9
  2053. VPXOR Y4, Y9, Y4
  2054. VPSRLD $0x07, Y4, Y8
  2055. VPSLLD $0x19, Y4, Y4
  2056. VPOR Y4, Y8, Y4
  2057. // Round 2
  2058. VPADDD Y0, Y4, Y0
  2059. VPADDD 64(SP), Y0, Y0
  2060. VPXOR Y12, Y0, Y12
  2061. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2062. VMOVDQU 512(SP), Y8
  2063. VPADDD Y8, Y12, Y8
  2064. VPXOR Y4, Y8, Y4
  2065. VMOVDQU Y8, 512(SP)
  2066. VPSRLD $0x0c, Y4, Y8
  2067. VPSLLD $0x14, Y4, Y4
  2068. VPOR Y4, Y8, Y4
  2069. VPADDD Y0, Y4, Y0
  2070. VPADDD 192(SP), Y0, Y0
  2071. VPXOR Y12, Y0, Y12
  2072. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2073. VMOVDQU 512(SP), Y8
  2074. VPADDD Y8, Y12, Y8
  2075. VPXOR Y4, Y8, Y4
  2076. VMOVDQU Y8, 512(SP)
  2077. VPSRLD $0x07, Y4, Y8
  2078. VPSLLD $0x19, Y4, Y4
  2079. VPOR Y4, Y8, Y4
  2080. VPADDD Y1, Y5, Y1
  2081. VPADDD 96(SP), Y1, Y1
  2082. VPXOR Y13, Y1, Y13
  2083. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2084. VPADDD Y9, Y13, Y9
  2085. VPXOR Y5, Y9, Y5
  2086. VPSRLD $0x0c, Y5, Y8
  2087. VPSLLD $0x14, Y5, Y5
  2088. VPOR Y5, Y8, Y5
  2089. VPADDD Y1, Y5, Y1
  2090. VPADDD 320(SP), Y1, Y1
  2091. VPXOR Y13, Y1, Y13
  2092. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2093. VPADDD Y9, Y13, Y9
  2094. VPXOR Y5, Y9, Y5
  2095. VPSRLD $0x07, Y5, Y8
  2096. VPSLLD $0x19, Y5, Y5
  2097. VPOR Y5, Y8, Y5
  2098. VPADDD Y2, Y6, Y2
  2099. VPADDD 224(SP), Y2, Y2
  2100. VPXOR Y14, Y2, Y14
  2101. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2102. VPADDD Y10, Y14, Y10
  2103. VPXOR Y6, Y10, Y6
  2104. VPSRLD $0x0c, Y6, Y8
  2105. VPSLLD $0x14, Y6, Y6
  2106. VPOR Y6, Y8, Y6
  2107. VPADDD Y2, Y6, Y2
  2108. VPADDD (SP), Y2, Y2
  2109. VPXOR Y14, Y2, Y14
  2110. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2111. VPADDD Y10, Y14, Y10
  2112. VPXOR Y6, Y10, Y6
  2113. VPSRLD $0x07, Y6, Y8
  2114. VPSLLD $0x19, Y6, Y6
  2115. VPOR Y6, Y8, Y6
  2116. VPADDD Y3, Y7, Y3
  2117. VPADDD 128(SP), Y3, Y3
  2118. VPXOR Y15, Y3, Y15
  2119. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2120. VPADDD Y11, Y15, Y11
  2121. VPXOR Y7, Y11, Y7
  2122. VPSRLD $0x0c, Y7, Y8
  2123. VPSLLD $0x14, Y7, Y7
  2124. VPOR Y7, Y8, Y7
  2125. VPADDD Y3, Y7, Y3
  2126. VPADDD 416(SP), Y3, Y3
  2127. VPXOR Y15, Y3, Y15
  2128. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2129. VPADDD Y11, Y15, Y11
  2130. VPXOR Y7, Y11, Y7
  2131. VPSRLD $0x07, Y7, Y8
  2132. VPSLLD $0x19, Y7, Y7
  2133. VPOR Y7, Y8, Y7
  2134. VPADDD Y0, Y5, Y0
  2135. VPADDD 32(SP), Y0, Y0
  2136. VPXOR Y15, Y0, Y15
  2137. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2138. VPADDD Y10, Y15, Y10
  2139. VPXOR Y5, Y10, Y5
  2140. VPSRLD $0x0c, Y5, Y8
  2141. VPSLLD $0x14, Y5, Y5
  2142. VPOR Y5, Y8, Y5
  2143. VPADDD Y0, Y5, Y0
  2144. VPADDD 352(SP), Y0, Y0
  2145. VPXOR Y15, Y0, Y15
  2146. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2147. VPADDD Y10, Y15, Y10
  2148. VPXOR Y5, Y10, Y5
  2149. VPSRLD $0x07, Y5, Y8
  2150. VPSLLD $0x19, Y5, Y5
  2151. VPOR Y5, Y8, Y5
  2152. VPADDD Y1, Y6, Y1
  2153. VPADDD 384(SP), Y1, Y1
  2154. VPXOR Y12, Y1, Y12
  2155. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2156. VPADDD Y11, Y12, Y11
  2157. VPXOR Y6, Y11, Y6
  2158. VPSRLD $0x0c, Y6, Y8
  2159. VPSLLD $0x14, Y6, Y6
  2160. VPOR Y6, Y8, Y6
  2161. VPADDD Y1, Y6, Y1
  2162. VPADDD 160(SP), Y1, Y1
  2163. VPXOR Y12, Y1, Y12
  2164. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2165. VPADDD Y11, Y12, Y11
  2166. VPXOR Y6, Y11, Y6
  2167. VPSRLD $0x07, Y6, Y8
  2168. VPSLLD $0x19, Y6, Y6
  2169. VPOR Y6, Y8, Y6
  2170. VPADDD Y2, Y7, Y2
  2171. VPADDD 288(SP), Y2, Y2
  2172. VPXOR Y13, Y2, Y13
  2173. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2174. VMOVDQU 512(SP), Y8
  2175. VPADDD Y8, Y13, Y8
  2176. VPXOR Y7, Y8, Y7
  2177. VMOVDQU Y8, 512(SP)
  2178. VPSRLD $0x0c, Y7, Y8
  2179. VPSLLD $0x14, Y7, Y7
  2180. VPOR Y7, Y8, Y7
  2181. VPADDD Y2, Y7, Y2
  2182. VPADDD 448(SP), Y2, Y2
  2183. VPXOR Y13, Y2, Y13
  2184. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2185. VMOVDQU 512(SP), Y8
  2186. VPADDD Y8, Y13, Y8
  2187. VPXOR Y7, Y8, Y7
  2188. VMOVDQU Y8, 512(SP)
  2189. VPSRLD $0x07, Y7, Y8
  2190. VPSLLD $0x19, Y7, Y7
  2191. VPOR Y7, Y8, Y7
  2192. VPADDD Y3, Y4, Y3
  2193. VPADDD 480(SP), Y3, Y3
  2194. VPXOR Y14, Y3, Y14
  2195. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2196. VPADDD Y9, Y14, Y9
  2197. VPXOR Y4, Y9, Y4
  2198. VPSRLD $0x0c, Y4, Y8
  2199. VPSLLD $0x14, Y4, Y4
  2200. VPOR Y4, Y8, Y4
  2201. VPADDD Y3, Y4, Y3
  2202. VPADDD 256(SP), Y3, Y3
  2203. VPXOR Y14, Y3, Y14
  2204. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2205. VPADDD Y9, Y14, Y9
  2206. VPXOR Y4, Y9, Y4
  2207. VPSRLD $0x07, Y4, Y8
  2208. VPSLLD $0x19, Y4, Y4
  2209. VPOR Y4, Y8, Y4
  2210. // Round 3
  2211. VPADDD Y0, Y4, Y0
  2212. VPADDD 96(SP), Y0, Y0
  2213. VPXOR Y12, Y0, Y12
  2214. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2215. VMOVDQU 512(SP), Y8
  2216. VPADDD Y8, Y12, Y8
  2217. VPXOR Y4, Y8, Y4
  2218. VMOVDQU Y8, 512(SP)
  2219. VPSRLD $0x0c, Y4, Y8
  2220. VPSLLD $0x14, Y4, Y4
  2221. VPOR Y4, Y8, Y4
  2222. VPADDD Y0, Y4, Y0
  2223. VPADDD 128(SP), Y0, Y0
  2224. VPXOR Y12, Y0, Y12
  2225. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2226. VMOVDQU 512(SP), Y8
  2227. VPADDD Y8, Y12, Y8
  2228. VPXOR Y4, Y8, Y4
  2229. VMOVDQU Y8, 512(SP)
  2230. VPSRLD $0x07, Y4, Y8
  2231. VPSLLD $0x19, Y4, Y4
  2232. VPOR Y4, Y8, Y4
  2233. VPADDD Y1, Y5, Y1
  2234. VPADDD 320(SP), Y1, Y1
  2235. VPXOR Y13, Y1, Y13
  2236. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2237. VPADDD Y9, Y13, Y9
  2238. VPXOR Y5, Y9, Y5
  2239. VPSRLD $0x0c, Y5, Y8
  2240. VPSLLD $0x14, Y5, Y5
  2241. VPOR Y5, Y8, Y5
  2242. VPADDD Y1, Y5, Y1
  2243. VPADDD 384(SP), Y1, Y1
  2244. VPXOR Y13, Y1, Y13
  2245. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2246. VPADDD Y9, Y13, Y9
  2247. VPXOR Y5, Y9, Y5
  2248. VPSRLD $0x07, Y5, Y8
  2249. VPSLLD $0x19, Y5, Y5
  2250. VPOR Y5, Y8, Y5
  2251. VPADDD Y2, Y6, Y2
  2252. VPADDD 416(SP), Y2, Y2
  2253. VPXOR Y14, Y2, Y14
  2254. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2255. VPADDD Y10, Y14, Y10
  2256. VPXOR Y6, Y10, Y6
  2257. VPSRLD $0x0c, Y6, Y8
  2258. VPSLLD $0x14, Y6, Y6
  2259. VPOR Y6, Y8, Y6
  2260. VPADDD Y2, Y6, Y2
  2261. VPADDD 64(SP), Y2, Y2
  2262. VPXOR Y14, Y2, Y14
  2263. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2264. VPADDD Y10, Y14, Y10
  2265. VPXOR Y6, Y10, Y6
  2266. VPSRLD $0x07, Y6, Y8
  2267. VPSLLD $0x19, Y6, Y6
  2268. VPOR Y6, Y8, Y6
  2269. VPADDD Y3, Y7, Y3
  2270. VPADDD 224(SP), Y3, Y3
  2271. VPXOR Y15, Y3, Y15
  2272. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2273. VPADDD Y11, Y15, Y11
  2274. VPXOR Y7, Y11, Y7
  2275. VPSRLD $0x0c, Y7, Y8
  2276. VPSLLD $0x14, Y7, Y7
  2277. VPOR Y7, Y8, Y7
  2278. VPADDD Y3, Y7, Y3
  2279. VPADDD 448(SP), Y3, Y3
  2280. VPXOR Y15, Y3, Y15
  2281. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2282. VPADDD Y11, Y15, Y11
  2283. VPXOR Y7, Y11, Y7
  2284. VPSRLD $0x07, Y7, Y8
  2285. VPSLLD $0x19, Y7, Y7
  2286. VPOR Y7, Y8, Y7
  2287. VPADDD Y0, Y5, Y0
  2288. VPADDD 192(SP), Y0, Y0
  2289. VPXOR Y15, Y0, Y15
  2290. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2291. VPADDD Y10, Y15, Y10
  2292. VPXOR Y5, Y10, Y5
  2293. VPSRLD $0x0c, Y5, Y8
  2294. VPSLLD $0x14, Y5, Y5
  2295. VPOR Y5, Y8, Y5
  2296. VPADDD Y0, Y5, Y0
  2297. VPADDD 160(SP), Y0, Y0
  2298. VPXOR Y15, Y0, Y15
  2299. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2300. VPADDD Y10, Y15, Y10
  2301. VPXOR Y5, Y10, Y5
  2302. VPSRLD $0x07, Y5, Y8
  2303. VPSLLD $0x19, Y5, Y5
  2304. VPOR Y5, Y8, Y5
  2305. VPADDD Y1, Y6, Y1
  2306. VPADDD 288(SP), Y1, Y1
  2307. VPXOR Y12, Y1, Y12
  2308. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2309. VPADDD Y11, Y12, Y11
  2310. VPXOR Y6, Y11, Y6
  2311. VPSRLD $0x0c, Y6, Y8
  2312. VPSLLD $0x14, Y6, Y6
  2313. VPOR Y6, Y8, Y6
  2314. VPADDD Y1, Y6, Y1
  2315. VPADDD (SP), Y1, Y1
  2316. VPXOR Y12, Y1, Y12
  2317. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2318. VPADDD Y11, Y12, Y11
  2319. VPXOR Y6, Y11, Y6
  2320. VPSRLD $0x07, Y6, Y8
  2321. VPSLLD $0x19, Y6, Y6
  2322. VPOR Y6, Y8, Y6
  2323. VPADDD Y2, Y7, Y2
  2324. VPADDD 352(SP), Y2, Y2
  2325. VPXOR Y13, Y2, Y13
  2326. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2327. VMOVDQU 512(SP), Y8
  2328. VPADDD Y8, Y13, Y8
  2329. VPXOR Y7, Y8, Y7
  2330. VMOVDQU Y8, 512(SP)
  2331. VPSRLD $0x0c, Y7, Y8
  2332. VPSLLD $0x14, Y7, Y7
  2333. VPOR Y7, Y8, Y7
  2334. VPADDD Y2, Y7, Y2
  2335. VPADDD 480(SP), Y2, Y2
  2336. VPXOR Y13, Y2, Y13
  2337. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2338. VMOVDQU 512(SP), Y8
  2339. VPADDD Y8, Y13, Y8
  2340. VPXOR Y7, Y8, Y7
  2341. VMOVDQU Y8, 512(SP)
  2342. VPSRLD $0x07, Y7, Y8
  2343. VPSLLD $0x19, Y7, Y7
  2344. VPOR Y7, Y8, Y7
  2345. VPADDD Y3, Y4, Y3
  2346. VPADDD 256(SP), Y3, Y3
  2347. VPXOR Y14, Y3, Y14
  2348. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2349. VPADDD Y9, Y14, Y9
  2350. VPXOR Y4, Y9, Y4
  2351. VPSRLD $0x0c, Y4, Y8
  2352. VPSLLD $0x14, Y4, Y4
  2353. VPOR Y4, Y8, Y4
  2354. VPADDD Y3, Y4, Y3
  2355. VPADDD 32(SP), Y3, Y3
  2356. VPXOR Y14, Y3, Y14
  2357. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2358. VPADDD Y9, Y14, Y9
  2359. VPXOR Y4, Y9, Y4
  2360. VPSRLD $0x07, Y4, Y8
  2361. VPSLLD $0x19, Y4, Y4
  2362. VPOR Y4, Y8, Y4
  2363. // Round 4
  2364. VPADDD Y0, Y4, Y0
  2365. VPADDD 320(SP), Y0, Y0
  2366. VPXOR Y12, Y0, Y12
  2367. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2368. VMOVDQU 512(SP), Y8
  2369. VPADDD Y8, Y12, Y8
  2370. VPXOR Y4, Y8, Y4
  2371. VMOVDQU Y8, 512(SP)
  2372. VPSRLD $0x0c, Y4, Y8
  2373. VPSLLD $0x14, Y4, Y4
  2374. VPOR Y4, Y8, Y4
  2375. VPADDD Y0, Y4, Y0
  2376. VPADDD 224(SP), Y0, Y0
  2377. VPXOR Y12, Y0, Y12
  2378. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2379. VMOVDQU 512(SP), Y8
  2380. VPADDD Y8, Y12, Y8
  2381. VPXOR Y4, Y8, Y4
  2382. VMOVDQU Y8, 512(SP)
  2383. VPSRLD $0x07, Y4, Y8
  2384. VPSLLD $0x19, Y4, Y4
  2385. VPOR Y4, Y8, Y4
  2386. VPADDD Y1, Y5, Y1
  2387. VPADDD 384(SP), Y1, Y1
  2388. VPXOR Y13, Y1, Y13
  2389. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2390. VPADDD Y9, Y13, Y9
  2391. VPXOR Y5, Y9, Y5
  2392. VPSRLD $0x0c, Y5, Y8
  2393. VPSLLD $0x14, Y5, Y5
  2394. VPOR Y5, Y8, Y5
  2395. VPADDD Y1, Y5, Y1
  2396. VPADDD 288(SP), Y1, Y1
  2397. VPXOR Y13, Y1, Y13
  2398. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2399. VPADDD Y9, Y13, Y9
  2400. VPXOR Y5, Y9, Y5
  2401. VPSRLD $0x07, Y5, Y8
  2402. VPSLLD $0x19, Y5, Y5
  2403. VPOR Y5, Y8, Y5
  2404. VPADDD Y2, Y6, Y2
  2405. VPADDD 448(SP), Y2, Y2
  2406. VPXOR Y14, Y2, Y14
  2407. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2408. VPADDD Y10, Y14, Y10
  2409. VPXOR Y6, Y10, Y6
  2410. VPSRLD $0x0c, Y6, Y8
  2411. VPSLLD $0x14, Y6, Y6
  2412. VPOR Y6, Y8, Y6
  2413. VPADDD Y2, Y6, Y2
  2414. VPADDD 96(SP), Y2, Y2
  2415. VPXOR Y14, Y2, Y14
  2416. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2417. VPADDD Y10, Y14, Y10
  2418. VPXOR Y6, Y10, Y6
  2419. VPSRLD $0x07, Y6, Y8
  2420. VPSLLD $0x19, Y6, Y6
  2421. VPOR Y6, Y8, Y6
  2422. VPADDD Y3, Y7, Y3
  2423. VPADDD 416(SP), Y3, Y3
  2424. VPXOR Y15, Y3, Y15
  2425. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2426. VPADDD Y11, Y15, Y11
  2427. VPXOR Y7, Y11, Y7
  2428. VPSRLD $0x0c, Y7, Y8
  2429. VPSLLD $0x14, Y7, Y7
  2430. VPOR Y7, Y8, Y7
  2431. VPADDD Y3, Y7, Y3
  2432. VPADDD 480(SP), Y3, Y3
  2433. VPXOR Y15, Y3, Y15
  2434. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2435. VPADDD Y11, Y15, Y11
  2436. VPXOR Y7, Y11, Y7
  2437. VPSRLD $0x07, Y7, Y8
  2438. VPSLLD $0x19, Y7, Y7
  2439. VPOR Y7, Y8, Y7
  2440. VPADDD Y0, Y5, Y0
  2441. VPADDD 128(SP), Y0, Y0
  2442. VPXOR Y15, Y0, Y15
  2443. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2444. VPADDD Y10, Y15, Y10
  2445. VPXOR Y5, Y10, Y5
  2446. VPSRLD $0x0c, Y5, Y8
  2447. VPSLLD $0x14, Y5, Y5
  2448. VPOR Y5, Y8, Y5
  2449. VPADDD Y0, Y5, Y0
  2450. VPADDD (SP), Y0, Y0
  2451. VPXOR Y15, Y0, Y15
  2452. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2453. VPADDD Y10, Y15, Y10
  2454. VPXOR Y5, Y10, Y5
  2455. VPSRLD $0x07, Y5, Y8
  2456. VPSLLD $0x19, Y5, Y5
  2457. VPOR Y5, Y8, Y5
  2458. VPADDD Y1, Y6, Y1
  2459. VPADDD 352(SP), Y1, Y1
  2460. VPXOR Y12, Y1, Y12
  2461. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2462. VPADDD Y11, Y12, Y11
  2463. VPXOR Y6, Y11, Y6
  2464. VPSRLD $0x0c, Y6, Y8
  2465. VPSLLD $0x14, Y6, Y6
  2466. VPOR Y6, Y8, Y6
  2467. VPADDD Y1, Y6, Y1
  2468. VPADDD 64(SP), Y1, Y1
  2469. VPXOR Y12, Y1, Y12
  2470. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2471. VPADDD Y11, Y12, Y11
  2472. VPXOR Y6, Y11, Y6
  2473. VPSRLD $0x07, Y6, Y8
  2474. VPSLLD $0x19, Y6, Y6
  2475. VPOR Y6, Y8, Y6
  2476. VPADDD Y2, Y7, Y2
  2477. VPADDD 160(SP), Y2, Y2
  2478. VPXOR Y13, Y2, Y13
  2479. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2480. VMOVDQU 512(SP), Y8
  2481. VPADDD Y8, Y13, Y8
  2482. VPXOR Y7, Y8, Y7
  2483. VMOVDQU Y8, 512(SP)
  2484. VPSRLD $0x0c, Y7, Y8
  2485. VPSLLD $0x14, Y7, Y7
  2486. VPOR Y7, Y8, Y7
  2487. VPADDD Y2, Y7, Y2
  2488. VPADDD 256(SP), Y2, Y2
  2489. VPXOR Y13, Y2, Y13
  2490. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2491. VMOVDQU 512(SP), Y8
  2492. VPADDD Y8, Y13, Y8
  2493. VPXOR Y7, Y8, Y7
  2494. VMOVDQU Y8, 512(SP)
  2495. VPSRLD $0x07, Y7, Y8
  2496. VPSLLD $0x19, Y7, Y7
  2497. VPOR Y7, Y8, Y7
  2498. VPADDD Y3, Y4, Y3
  2499. VPADDD 32(SP), Y3, Y3
  2500. VPXOR Y14, Y3, Y14
  2501. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2502. VPADDD Y9, Y14, Y9
  2503. VPXOR Y4, Y9, Y4
  2504. VPSRLD $0x0c, Y4, Y8
  2505. VPSLLD $0x14, Y4, Y4
  2506. VPOR Y4, Y8, Y4
  2507. VPADDD Y3, Y4, Y3
  2508. VPADDD 192(SP), Y3, Y3
  2509. VPXOR Y14, Y3, Y14
  2510. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2511. VPADDD Y9, Y14, Y9
  2512. VPXOR Y4, Y9, Y4
  2513. VPSRLD $0x07, Y4, Y8
  2514. VPSLLD $0x19, Y4, Y4
  2515. VPOR Y4, Y8, Y4
  2516. // Round 5
  2517. VPADDD Y0, Y4, Y0
  2518. VPADDD 384(SP), Y0, Y0
  2519. VPXOR Y12, Y0, Y12
  2520. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2521. VMOVDQU 512(SP), Y8
  2522. VPADDD Y8, Y12, Y8
  2523. VPXOR Y4, Y8, Y4
  2524. VMOVDQU Y8, 512(SP)
  2525. VPSRLD $0x0c, Y4, Y8
  2526. VPSLLD $0x14, Y4, Y4
  2527. VPOR Y4, Y8, Y4
  2528. VPADDD Y0, Y4, Y0
  2529. VPADDD 416(SP), Y0, Y0
  2530. VPXOR Y12, Y0, Y12
  2531. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2532. VMOVDQU 512(SP), Y8
  2533. VPADDD Y8, Y12, Y8
  2534. VPXOR Y4, Y8, Y4
  2535. VMOVDQU Y8, 512(SP)
  2536. VPSRLD $0x07, Y4, Y8
  2537. VPSLLD $0x19, Y4, Y4
  2538. VPOR Y4, Y8, Y4
  2539. VPADDD Y1, Y5, Y1
  2540. VPADDD 288(SP), Y1, Y1
  2541. VPXOR Y13, Y1, Y13
  2542. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2543. VPADDD Y9, Y13, Y9
  2544. VPXOR Y5, Y9, Y5
  2545. VPSRLD $0x0c, Y5, Y8
  2546. VPSLLD $0x14, Y5, Y5
  2547. VPOR Y5, Y8, Y5
  2548. VPADDD Y1, Y5, Y1
  2549. VPADDD 352(SP), Y1, Y1
  2550. VPXOR Y13, Y1, Y13
  2551. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2552. VPADDD Y9, Y13, Y9
  2553. VPXOR Y5, Y9, Y5
  2554. VPSRLD $0x07, Y5, Y8
  2555. VPSLLD $0x19, Y5, Y5
  2556. VPOR Y5, Y8, Y5
  2557. VPADDD Y2, Y6, Y2
  2558. VPADDD 480(SP), Y2, Y2
  2559. VPXOR Y14, Y2, Y14
  2560. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2561. VPADDD Y10, Y14, Y10
  2562. VPXOR Y6, Y10, Y6
  2563. VPSRLD $0x0c, Y6, Y8
  2564. VPSLLD $0x14, Y6, Y6
  2565. VPOR Y6, Y8, Y6
  2566. VPADDD Y2, Y6, Y2
  2567. VPADDD 320(SP), Y2, Y2
  2568. VPXOR Y14, Y2, Y14
  2569. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2570. VPADDD Y10, Y14, Y10
  2571. VPXOR Y6, Y10, Y6
  2572. VPSRLD $0x07, Y6, Y8
  2573. VPSLLD $0x19, Y6, Y6
  2574. VPOR Y6, Y8, Y6
  2575. VPADDD Y3, Y7, Y3
  2576. VPADDD 448(SP), Y3, Y3
  2577. VPXOR Y15, Y3, Y15
  2578. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2579. VPADDD Y11, Y15, Y11
  2580. VPXOR Y7, Y11, Y7
  2581. VPSRLD $0x0c, Y7, Y8
  2582. VPSLLD $0x14, Y7, Y7
  2583. VPOR Y7, Y8, Y7
  2584. VPADDD Y3, Y7, Y3
  2585. VPADDD 256(SP), Y3, Y3
  2586. VPXOR Y15, Y3, Y15
  2587. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2588. VPADDD Y11, Y15, Y11
  2589. VPXOR Y7, Y11, Y7
  2590. VPSRLD $0x07, Y7, Y8
  2591. VPSLLD $0x19, Y7, Y7
  2592. VPOR Y7, Y8, Y7
  2593. VPADDD Y0, Y5, Y0
  2594. VPADDD 224(SP), Y0, Y0
  2595. VPXOR Y15, Y0, Y15
  2596. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2597. VPADDD Y10, Y15, Y10
  2598. VPXOR Y5, Y10, Y5
  2599. VPSRLD $0x0c, Y5, Y8
  2600. VPSLLD $0x14, Y5, Y5
  2601. VPOR Y5, Y8, Y5
  2602. VPADDD Y0, Y5, Y0
  2603. VPADDD 64(SP), Y0, Y0
  2604. VPXOR Y15, Y0, Y15
  2605. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2606. VPADDD Y10, Y15, Y10
  2607. VPXOR Y5, Y10, Y5
  2608. VPSRLD $0x07, Y5, Y8
  2609. VPSLLD $0x19, Y5, Y5
  2610. VPOR Y5, Y8, Y5
  2611. VPADDD Y1, Y6, Y1
  2612. VPADDD 160(SP), Y1, Y1
  2613. VPXOR Y12, Y1, Y12
  2614. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2615. VPADDD Y11, Y12, Y11
  2616. VPXOR Y6, Y11, Y6
  2617. VPSRLD $0x0c, Y6, Y8
  2618. VPSLLD $0x14, Y6, Y6
  2619. VPOR Y6, Y8, Y6
  2620. VPADDD Y1, Y6, Y1
  2621. VPADDD 96(SP), Y1, Y1
  2622. VPXOR Y12, Y1, Y12
  2623. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2624. VPADDD Y11, Y12, Y11
  2625. VPXOR Y6, Y11, Y6
  2626. VPSRLD $0x07, Y6, Y8
  2627. VPSLLD $0x19, Y6, Y6
  2628. VPOR Y6, Y8, Y6
  2629. VPADDD Y2, Y7, Y2
  2630. VPADDD (SP), Y2, Y2
  2631. VPXOR Y13, Y2, Y13
  2632. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2633. VMOVDQU 512(SP), Y8
  2634. VPADDD Y8, Y13, Y8
  2635. VPXOR Y7, Y8, Y7
  2636. VMOVDQU Y8, 512(SP)
  2637. VPSRLD $0x0c, Y7, Y8
  2638. VPSLLD $0x14, Y7, Y7
  2639. VPOR Y7, Y8, Y7
  2640. VPADDD Y2, Y7, Y2
  2641. VPADDD 32(SP), Y2, Y2
  2642. VPXOR Y13, Y2, Y13
  2643. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2644. VMOVDQU 512(SP), Y8
  2645. VPADDD Y8, Y13, Y8
  2646. VPXOR Y7, Y8, Y7
  2647. VMOVDQU Y8, 512(SP)
  2648. VPSRLD $0x07, Y7, Y8
  2649. VPSLLD $0x19, Y7, Y7
  2650. VPOR Y7, Y8, Y7
  2651. VPADDD Y3, Y4, Y3
  2652. VPADDD 192(SP), Y3, Y3
  2653. VPXOR Y14, Y3, Y14
  2654. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2655. VPADDD Y9, Y14, Y9
  2656. VPXOR Y4, Y9, Y4
  2657. VPSRLD $0x0c, Y4, Y8
  2658. VPSLLD $0x14, Y4, Y4
  2659. VPOR Y4, Y8, Y4
  2660. VPADDD Y3, Y4, Y3
  2661. VPADDD 128(SP), Y3, Y3
  2662. VPXOR Y14, Y3, Y14
  2663. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2664. VPADDD Y9, Y14, Y9
  2665. VPXOR Y4, Y9, Y4
  2666. VPSRLD $0x07, Y4, Y8
  2667. VPSLLD $0x19, Y4, Y4
  2668. VPOR Y4, Y8, Y4
  2669. // Round 6
  2670. VPADDD Y0, Y4, Y0
  2671. VPADDD 288(SP), Y0, Y0
  2672. VPXOR Y12, Y0, Y12
  2673. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2674. VMOVDQU 512(SP), Y8
  2675. VPADDD Y8, Y12, Y8
  2676. VPXOR Y4, Y8, Y4
  2677. VMOVDQU Y8, 512(SP)
  2678. VPSRLD $0x0c, Y4, Y8
  2679. VPSLLD $0x14, Y4, Y4
  2680. VPOR Y4, Y8, Y4
  2681. VPADDD Y0, Y4, Y0
  2682. VPADDD 448(SP), Y0, Y0
  2683. VPXOR Y12, Y0, Y12
  2684. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2685. VMOVDQU 512(SP), Y8
  2686. VPADDD Y8, Y12, Y8
  2687. VPXOR Y4, Y8, Y4
  2688. VMOVDQU Y8, 512(SP)
  2689. VPSRLD $0x07, Y4, Y8
  2690. VPSLLD $0x19, Y4, Y4
  2691. VPOR Y4, Y8, Y4
  2692. VPADDD Y1, Y5, Y1
  2693. VPADDD 352(SP), Y1, Y1
  2694. VPXOR Y13, Y1, Y13
  2695. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2696. VPADDD Y9, Y13, Y9
  2697. VPXOR Y5, Y9, Y5
  2698. VPSRLD $0x0c, Y5, Y8
  2699. VPSLLD $0x14, Y5, Y5
  2700. VPOR Y5, Y8, Y5
  2701. VPADDD Y1, Y5, Y1
  2702. VPADDD 160(SP), Y1, Y1
  2703. VPXOR Y13, Y1, Y13
  2704. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2705. VPADDD Y9, Y13, Y9
  2706. VPXOR Y5, Y9, Y5
  2707. VPSRLD $0x07, Y5, Y8
  2708. VPSLLD $0x19, Y5, Y5
  2709. VPOR Y5, Y8, Y5
  2710. VPADDD Y2, Y6, Y2
  2711. VPADDD 256(SP), Y2, Y2
  2712. VPXOR Y14, Y2, Y14
  2713. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2714. VPADDD Y10, Y14, Y10
  2715. VPXOR Y6, Y10, Y6
  2716. VPSRLD $0x0c, Y6, Y8
  2717. VPSLLD $0x14, Y6, Y6
  2718. VPOR Y6, Y8, Y6
  2719. VPADDD Y2, Y6, Y2
  2720. VPADDD 384(SP), Y2, Y2
  2721. VPXOR Y14, Y2, Y14
  2722. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2723. VPADDD Y10, Y14, Y10
  2724. VPXOR Y6, Y10, Y6
  2725. VPSRLD $0x07, Y6, Y8
  2726. VPSLLD $0x19, Y6, Y6
  2727. VPOR Y6, Y8, Y6
  2728. VPADDD Y3, Y7, Y3
  2729. VPADDD 480(SP), Y3, Y3
  2730. VPXOR Y15, Y3, Y15
  2731. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2732. VPADDD Y11, Y15, Y11
  2733. VPXOR Y7, Y11, Y7
  2734. VPSRLD $0x0c, Y7, Y8
  2735. VPSLLD $0x14, Y7, Y7
  2736. VPOR Y7, Y8, Y7
  2737. VPADDD Y3, Y7, Y3
  2738. VPADDD 32(SP), Y3, Y3
  2739. VPXOR Y15, Y3, Y15
  2740. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2741. VPADDD Y11, Y15, Y11
  2742. VPXOR Y7, Y11, Y7
  2743. VPSRLD $0x07, Y7, Y8
  2744. VPSLLD $0x19, Y7, Y7
  2745. VPOR Y7, Y8, Y7
  2746. VPADDD Y0, Y5, Y0
  2747. VPADDD 416(SP), Y0, Y0
  2748. VPXOR Y15, Y0, Y15
  2749. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2750. VPADDD Y10, Y15, Y10
  2751. VPXOR Y5, Y10, Y5
  2752. VPSRLD $0x0c, Y5, Y8
  2753. VPSLLD $0x14, Y5, Y5
  2754. VPOR Y5, Y8, Y5
  2755. VPADDD Y0, Y5, Y0
  2756. VPADDD 96(SP), Y0, Y0
  2757. VPXOR Y15, Y0, Y15
  2758. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2759. VPADDD Y10, Y15, Y10
  2760. VPXOR Y5, Y10, Y5
  2761. VPSRLD $0x07, Y5, Y8
  2762. VPSLLD $0x19, Y5, Y5
  2763. VPOR Y5, Y8, Y5
  2764. VPADDD Y1, Y6, Y1
  2765. VPADDD (SP), Y1, Y1
  2766. VPXOR Y12, Y1, Y12
  2767. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2768. VPADDD Y11, Y12, Y11
  2769. VPXOR Y6, Y11, Y6
  2770. VPSRLD $0x0c, Y6, Y8
  2771. VPSLLD $0x14, Y6, Y6
  2772. VPOR Y6, Y8, Y6
  2773. VPADDD Y1, Y6, Y1
  2774. VPADDD 320(SP), Y1, Y1
  2775. VPXOR Y12, Y1, Y12
  2776. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2777. VPADDD Y11, Y12, Y11
  2778. VPXOR Y6, Y11, Y6
  2779. VPSRLD $0x07, Y6, Y8
  2780. VPSLLD $0x19, Y6, Y6
  2781. VPOR Y6, Y8, Y6
  2782. VPADDD Y2, Y7, Y2
  2783. VPADDD 64(SP), Y2, Y2
  2784. VPXOR Y13, Y2, Y13
  2785. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2786. VMOVDQU 512(SP), Y8
  2787. VPADDD Y8, Y13, Y8
  2788. VPXOR Y7, Y8, Y7
  2789. VMOVDQU Y8, 512(SP)
  2790. VPSRLD $0x0c, Y7, Y8
  2791. VPSLLD $0x14, Y7, Y7
  2792. VPOR Y7, Y8, Y7
  2793. VPADDD Y2, Y7, Y2
  2794. VPADDD 192(SP), Y2, Y2
  2795. VPXOR Y13, Y2, Y13
  2796. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2797. VMOVDQU 512(SP), Y8
  2798. VPADDD Y8, Y13, Y8
  2799. VPXOR Y7, Y8, Y7
  2800. VMOVDQU Y8, 512(SP)
  2801. VPSRLD $0x07, Y7, Y8
  2802. VPSLLD $0x19, Y7, Y7
  2803. VPOR Y7, Y8, Y7
  2804. VPADDD Y3, Y4, Y3
  2805. VPADDD 128(SP), Y3, Y3
  2806. VPXOR Y14, Y3, Y14
  2807. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2808. VPADDD Y9, Y14, Y9
  2809. VPXOR Y4, Y9, Y4
  2810. VPSRLD $0x0c, Y4, Y8
  2811. VPSLLD $0x14, Y4, Y4
  2812. VPOR Y4, Y8, Y4
  2813. VPADDD Y3, Y4, Y3
  2814. VPADDD 224(SP), Y3, Y3
  2815. VPXOR Y14, Y3, Y14
  2816. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2817. VPADDD Y9, Y14, Y9
  2818. VPXOR Y4, Y9, Y4
  2819. VPSRLD $0x07, Y4, Y8
  2820. VPSLLD $0x19, Y4, Y4
  2821. VPOR Y4, Y8, Y4
  2822. // Round 7
  2823. VPADDD Y0, Y4, Y0
  2824. VPADDD 352(SP), Y0, Y0
  2825. VPXOR Y12, Y0, Y12
  2826. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2827. VMOVDQU 512(SP), Y8
  2828. VPADDD Y8, Y12, Y8
  2829. VPXOR Y4, Y8, Y4
  2830. VMOVDQU Y8, 512(SP)
  2831. VPSRLD $0x0c, Y4, Y8
  2832. VPSLLD $0x14, Y4, Y4
  2833. VPOR Y4, Y8, Y4
  2834. VPADDD Y0, Y4, Y0
  2835. VPADDD 480(SP), Y0, Y0
  2836. VPXOR Y12, Y0, Y12
  2837. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2838. VMOVDQU 512(SP), Y8
  2839. VPADDD Y8, Y12, Y8
  2840. VPXOR Y4, Y8, Y4
  2841. VMOVDQU Y8, 512(SP)
  2842. VPSRLD $0x07, Y4, Y8
  2843. VPSLLD $0x19, Y4, Y4
  2844. VPOR Y4, Y8, Y4
  2845. VPADDD Y1, Y5, Y1
  2846. VPADDD 160(SP), Y1, Y1
  2847. VPXOR Y13, Y1, Y13
  2848. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2849. VPADDD Y9, Y13, Y9
  2850. VPXOR Y5, Y9, Y5
  2851. VPSRLD $0x0c, Y5, Y8
  2852. VPSLLD $0x14, Y5, Y5
  2853. VPOR Y5, Y8, Y5
  2854. VPADDD Y1, Y5, Y1
  2855. VPADDD (SP), Y1, Y1
  2856. VPXOR Y13, Y1, Y13
  2857. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2858. VPADDD Y9, Y13, Y9
  2859. VPXOR Y5, Y9, Y5
  2860. VPSRLD $0x07, Y5, Y8
  2861. VPSLLD $0x19, Y5, Y5
  2862. VPOR Y5, Y8, Y5
  2863. VPADDD Y2, Y6, Y2
  2864. VPADDD 32(SP), Y2, Y2
  2865. VPXOR Y14, Y2, Y14
  2866. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2867. VPADDD Y10, Y14, Y10
  2868. VPXOR Y6, Y10, Y6
  2869. VPSRLD $0x0c, Y6, Y8
  2870. VPSLLD $0x14, Y6, Y6
  2871. VPOR Y6, Y8, Y6
  2872. VPADDD Y2, Y6, Y2
  2873. VPADDD 288(SP), Y2, Y2
  2874. VPXOR Y14, Y2, Y14
  2875. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2876. VPADDD Y10, Y14, Y10
  2877. VPXOR Y6, Y10, Y6
  2878. VPSRLD $0x07, Y6, Y8
  2879. VPSLLD $0x19, Y6, Y6
  2880. VPOR Y6, Y8, Y6
  2881. VPADDD Y3, Y7, Y3
  2882. VPADDD 256(SP), Y3, Y3
  2883. VPXOR Y15, Y3, Y15
  2884. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2885. VPADDD Y11, Y15, Y11
  2886. VPXOR Y7, Y11, Y7
  2887. VPSRLD $0x0c, Y7, Y8
  2888. VPSLLD $0x14, Y7, Y7
  2889. VPOR Y7, Y8, Y7
  2890. VPADDD Y3, Y7, Y3
  2891. VPADDD 192(SP), Y3, Y3
  2892. VPXOR Y15, Y3, Y15
  2893. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2894. VPADDD Y11, Y15, Y11
  2895. VPXOR Y7, Y11, Y7
  2896. VPSRLD $0x07, Y7, Y8
  2897. VPSLLD $0x19, Y7, Y7
  2898. VPOR Y7, Y8, Y7
  2899. VPADDD Y0, Y5, Y0
  2900. VPADDD 448(SP), Y0, Y0
  2901. VPXOR Y15, Y0, Y15
  2902. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  2903. VPADDD Y10, Y15, Y10
  2904. VPXOR Y5, Y10, Y5
  2905. VPSRLD $0x0c, Y5, Y8
  2906. VPSLLD $0x14, Y5, Y5
  2907. VPOR Y5, Y8, Y5
  2908. VPADDD Y0, Y5, Y0
  2909. VPADDD 320(SP), Y0, Y0
  2910. VPXOR Y15, Y0, Y15
  2911. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  2912. VPADDD Y10, Y15, Y10
  2913. VPXOR Y5, Y10, Y5
  2914. VPSRLD $0x07, Y5, Y8
  2915. VPSLLD $0x19, Y5, Y5
  2916. VPOR Y5, Y8, Y5
  2917. VPADDD Y1, Y6, Y1
  2918. VPADDD 64(SP), Y1, Y1
  2919. VPXOR Y12, Y1, Y12
  2920. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  2921. VPADDD Y11, Y12, Y11
  2922. VPXOR Y6, Y11, Y6
  2923. VPSRLD $0x0c, Y6, Y8
  2924. VPSLLD $0x14, Y6, Y6
  2925. VPOR Y6, Y8, Y6
  2926. VPADDD Y1, Y6, Y1
  2927. VPADDD 384(SP), Y1, Y1
  2928. VPXOR Y12, Y1, Y12
  2929. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  2930. VPADDD Y11, Y12, Y11
  2931. VPXOR Y6, Y11, Y6
  2932. VPSRLD $0x07, Y6, Y8
  2933. VPSLLD $0x19, Y6, Y6
  2934. VPOR Y6, Y8, Y6
  2935. VPADDD Y2, Y7, Y2
  2936. VPADDD 96(SP), Y2, Y2
  2937. VPXOR Y13, Y2, Y13
  2938. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  2939. VMOVDQU 512(SP), Y8
  2940. VPADDD Y8, Y13, Y8
  2941. VPXOR Y7, Y8, Y7
  2942. VMOVDQU Y8, 512(SP)
  2943. VPSRLD $0x0c, Y7, Y8
  2944. VPSLLD $0x14, Y7, Y7
  2945. VPOR Y7, Y8, Y7
  2946. VPADDD Y2, Y7, Y2
  2947. VPADDD 128(SP), Y2, Y2
  2948. VPXOR Y13, Y2, Y13
  2949. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  2950. VMOVDQU 512(SP), Y8
  2951. VPADDD Y8, Y13, Y8
  2952. VPXOR Y7, Y8, Y7
  2953. VMOVDQU Y8, 512(SP)
  2954. VPSRLD $0x07, Y7, Y8
  2955. VPSLLD $0x19, Y7, Y7
  2956. VPOR Y7, Y8, Y7
  2957. VPADDD Y3, Y4, Y3
  2958. VPADDD 224(SP), Y3, Y3
  2959. VPXOR Y14, Y3, Y14
  2960. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  2961. VPADDD Y9, Y14, Y9
  2962. VPXOR Y4, Y9, Y4
  2963. VPSRLD $0x0c, Y4, Y8
  2964. VPSLLD $0x14, Y4, Y4
  2965. VPOR Y4, Y8, Y4
  2966. VPADDD Y3, Y4, Y3
  2967. VPADDD 416(SP), Y3, Y3
  2968. VPXOR Y14, Y3, Y14
  2969. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  2970. VPADDD Y9, Y14, Y9
  2971. VPXOR Y4, Y9, Y4
  2972. VPSRLD $0x07, Y4, Y8
  2973. VPSLLD $0x19, Y4, Y4
  2974. VPOR Y4, Y8, Y4
  2975. VMOVDQU 512(SP), Y8
  2976. // Finalize CVs
  2977. VMOVDQU Y8, 256(SP)
  2978. VMOVDQU Y9, 288(SP)
  2979. VMOVDQU Y10, 320(SP)
  2980. VMOVDQU Y11, 352(SP)
  2981. VMOVDQU Y12, 384(SP)
  2982. VMOVDQU Y13, 416(SP)
  2983. VMOVDQU Y14, 448(SP)
  2984. VMOVDQU Y15, 480(SP)
  2985. VPXOR Y0, Y8, Y0
  2986. VPXOR Y1, Y9, Y1
  2987. VPXOR Y2, Y10, Y2
  2988. VPXOR Y3, Y11, Y3
  2989. VPXOR Y4, Y12, Y4
  2990. VPXOR Y5, Y13, Y5
  2991. VPXOR Y6, Y14, Y6
  2992. VPXOR Y7, Y15, Y7
  2993. VPUNPCKLDQ Y1, Y0, Y8
  2994. VPUNPCKHDQ Y1, Y0, Y9
  2995. VPUNPCKLDQ Y3, Y2, Y10
  2996. VPUNPCKHDQ Y3, Y2, Y11
  2997. VPUNPCKLDQ Y5, Y4, Y12
  2998. VPUNPCKHDQ Y5, Y4, Y13
  2999. VPUNPCKLDQ Y7, Y6, Y14
  3000. VPUNPCKHDQ Y7, Y6, Y15
  3001. VPUNPCKLQDQ Y10, Y8, Y0
  3002. VPUNPCKHQDQ Y10, Y8, Y1
  3003. VPUNPCKLQDQ Y11, Y9, Y2
  3004. VPUNPCKHQDQ Y11, Y9, Y3
  3005. VPUNPCKLQDQ Y14, Y12, Y4
  3006. VPUNPCKHQDQ Y14, Y12, Y5
  3007. VPUNPCKLQDQ Y15, Y13, Y6
  3008. VPUNPCKHQDQ Y15, Y13, Y7
  3009. VPERM2I128 $0x20, Y4, Y0, Y8
  3010. VPERM2I128 $0x31, Y4, Y0, Y12
  3011. VPERM2I128 $0x20, Y5, Y1, Y9
  3012. VPERM2I128 $0x31, Y5, Y1, Y13
  3013. VPERM2I128 $0x20, Y6, Y2, Y10
  3014. VPERM2I128 $0x31, Y6, Y2, Y14
  3015. VPERM2I128 $0x20, Y7, Y3, Y11
  3016. VPERM2I128 $0x31, Y7, Y3, Y15
  3017. VMOVDQU Y8, (AX)
  3018. VMOVDQU Y9, 64(AX)
  3019. VMOVDQU Y10, 128(AX)
  3020. VMOVDQU Y11, 192(AX)
  3021. VMOVDQU Y12, 256(AX)
  3022. VMOVDQU Y13, 320(AX)
  3023. VMOVDQU Y14, 384(AX)
  3024. VMOVDQU Y15, 448(AX)
  3025. VMOVDQU 256(SP), Y8
  3026. VMOVDQU 288(SP), Y9
  3027. VMOVDQU 320(SP), Y10
  3028. VMOVDQU 352(SP), Y11
  3029. VMOVDQU 384(SP), Y12
  3030. VMOVDQU 416(SP), Y13
  3031. VMOVDQU 448(SP), Y14
  3032. VMOVDQU 480(SP), Y15
  3033. VPBROADCASTD (DX), Y0
  3034. VPXOR Y0, Y8, Y8
  3035. VPBROADCASTD 4(DX), Y0
  3036. VPXOR Y0, Y9, Y9
  3037. VPBROADCASTD 8(DX), Y0
  3038. VPXOR Y0, Y10, Y10
  3039. VPBROADCASTD 12(DX), Y0
  3040. VPXOR Y0, Y11, Y11
  3041. VPBROADCASTD 16(DX), Y0
  3042. VPXOR Y0, Y12, Y12
  3043. VPBROADCASTD 20(DX), Y0
  3044. VPXOR Y0, Y13, Y13
  3045. VPBROADCASTD 24(DX), Y0
  3046. VPXOR Y0, Y14, Y14
  3047. VPBROADCASTD 28(DX), Y0
  3048. VPXOR Y0, Y15, Y15
  3049. VPUNPCKLDQ Y9, Y8, Y0
  3050. VPUNPCKHDQ Y9, Y8, Y1
  3051. VPUNPCKLDQ Y11, Y10, Y2
  3052. VPUNPCKHDQ Y11, Y10, Y3
  3053. VPUNPCKLDQ Y13, Y12, Y4
  3054. VPUNPCKHDQ Y13, Y12, Y5
  3055. VPUNPCKLDQ Y15, Y14, Y6
  3056. VPUNPCKHDQ Y15, Y14, Y7
  3057. VPUNPCKLQDQ Y2, Y0, Y8
  3058. VPUNPCKHQDQ Y2, Y0, Y9
  3059. VPUNPCKLQDQ Y3, Y1, Y10
  3060. VPUNPCKHQDQ Y3, Y1, Y11
  3061. VPUNPCKLQDQ Y6, Y4, Y12
  3062. VPUNPCKHQDQ Y6, Y4, Y13
  3063. VPUNPCKLQDQ Y7, Y5, Y14
  3064. VPUNPCKHQDQ Y7, Y5, Y15
  3065. VPERM2I128 $0x20, Y12, Y8, Y0
  3066. VPERM2I128 $0x31, Y12, Y8, Y4
  3067. VPERM2I128 $0x20, Y13, Y9, Y1
  3068. VPERM2I128 $0x31, Y13, Y9, Y5
  3069. VPERM2I128 $0x20, Y14, Y10, Y2
  3070. VPERM2I128 $0x31, Y14, Y10, Y6
  3071. VPERM2I128 $0x20, Y15, Y11, Y3
  3072. VPERM2I128 $0x31, Y15, Y11, Y7
  3073. VMOVDQU Y0, 32(AX)
  3074. VMOVDQU Y1, 96(AX)
  3075. VMOVDQU Y2, 160(AX)
  3076. VMOVDQU Y3, 224(AX)
  3077. VMOVDQU Y4, 288(AX)
  3078. VMOVDQU Y5, 352(AX)
  3079. VMOVDQU Y6, 416(AX)
  3080. VMOVDQU Y7, 480(AX)
  3081. RET
  3082. // func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
  3083. // Requires: AVX, AVX2
  3084. TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40
  3085. MOVQ cvs+0(FP), AX
  3086. MOVQ buf+8(FP), CX
  3087. MOVQ key+16(FP), DX
  3088. // Load key
  3089. VPBROADCASTD (DX), Y0
  3090. VPBROADCASTD 4(DX), Y1
  3091. VPBROADCASTD 8(DX), Y2
  3092. VPBROADCASTD 12(DX), Y3
  3093. VPBROADCASTD 16(DX), Y4
  3094. VPBROADCASTD 20(DX), Y5
  3095. VPBROADCASTD 24(DX), Y6
  3096. VPBROADCASTD 28(DX), Y7
  3097. // Initialize counter
  3098. VPBROADCASTQ counter+24(FP), Y12
  3099. VPBROADCASTQ counter+24(FP), Y13
  3100. VPADDQ seq64<>+0(SB), Y12, Y12
  3101. VPADDQ seq64<>+32(SB), Y13, Y13
  3102. VPUNPCKLDQ Y13, Y12, Y14
  3103. VPUNPCKHDQ Y13, Y12, Y15
  3104. VPUNPCKLDQ Y15, Y14, Y12
  3105. VPUNPCKHDQ Y15, Y14, Y13
  3106. VPERMQ $0xd8, Y12, Y12
  3107. VPERMQ $0xd8, Y13, Y13
  3108. VMOVDQU Y12, 512(SP)
  3109. VMOVDQU Y13, 544(SP)
  3110. // Initialize flags
  3111. VPBROADCASTD flags+32(FP), Y14
  3112. VMOVDQU Y14, 576(SP)
  3113. VMOVDQU Y14, 608(SP)
  3114. ORL $0x01, 576(SP)
  3115. ORL $0x02, 636(SP)
  3116. // Loop index
  3117. XORQ DX, DX
  3118. loop:
  3119. // Load transposed block
  3120. VMOVDQU seq<>+0(SB), Y9
  3121. VPSLLD $0x0a, Y9, Y9
  3122. VPCMPEQD Y8, Y8, Y8
  3123. VPGATHERDD Y8, (CX)(Y9*1), Y10
  3124. VMOVDQU Y10, (SP)
  3125. VPCMPEQD Y8, Y8, Y8
  3126. VPGATHERDD Y8, 4(CX)(Y9*1), Y10
  3127. VMOVDQU Y10, 32(SP)
  3128. VPCMPEQD Y8, Y8, Y8
  3129. VPGATHERDD Y8, 8(CX)(Y9*1), Y10
  3130. VMOVDQU Y10, 64(SP)
  3131. VPCMPEQD Y8, Y8, Y8
  3132. VPGATHERDD Y8, 12(CX)(Y9*1), Y10
  3133. VMOVDQU Y10, 96(SP)
  3134. VPCMPEQD Y8, Y8, Y8
  3135. VPGATHERDD Y8, 16(CX)(Y9*1), Y10
  3136. VMOVDQU Y10, 128(SP)
  3137. VPCMPEQD Y8, Y8, Y8
  3138. VPGATHERDD Y8, 20(CX)(Y9*1), Y10
  3139. VMOVDQU Y10, 160(SP)
  3140. VPCMPEQD Y8, Y8, Y8
  3141. VPGATHERDD Y8, 24(CX)(Y9*1), Y10
  3142. VMOVDQU Y10, 192(SP)
  3143. VPCMPEQD Y8, Y8, Y8
  3144. VPGATHERDD Y8, 28(CX)(Y9*1), Y10
  3145. VMOVDQU Y10, 224(SP)
  3146. VPCMPEQD Y8, Y8, Y8
  3147. VPGATHERDD Y8, 32(CX)(Y9*1), Y10
  3148. VMOVDQU Y10, 256(SP)
  3149. VPCMPEQD Y8, Y8, Y8
  3150. VPGATHERDD Y8, 36(CX)(Y9*1), Y10
  3151. VMOVDQU Y10, 288(SP)
  3152. VPCMPEQD Y8, Y8, Y8
  3153. VPGATHERDD Y8, 40(CX)(Y9*1), Y10
  3154. VMOVDQU Y10, 320(SP)
  3155. VPCMPEQD Y8, Y8, Y8
  3156. VPGATHERDD Y8, 44(CX)(Y9*1), Y10
  3157. VMOVDQU Y10, 352(SP)
  3158. VPCMPEQD Y8, Y8, Y8
  3159. VPGATHERDD Y8, 48(CX)(Y9*1), Y10
  3160. VMOVDQU Y10, 384(SP)
  3161. VPCMPEQD Y8, Y8, Y8
  3162. VPGATHERDD Y8, 52(CX)(Y9*1), Y10
  3163. VMOVDQU Y10, 416(SP)
  3164. VPCMPEQD Y8, Y8, Y8
  3165. VPGATHERDD Y8, 56(CX)(Y9*1), Y10
  3166. VMOVDQU Y10, 448(SP)
  3167. VPCMPEQD Y8, Y8, Y8
  3168. VPGATHERDD Y8, 60(CX)(Y9*1), Y10
  3169. VMOVDQU Y10, 480(SP)
  3170. ADDQ $0x40, CX
  3171. // Reload state vectors (other than CVs)
  3172. VPBROADCASTD iv<>+0(SB), Y8
  3173. VPBROADCASTD iv<>+4(SB), Y9
  3174. VPBROADCASTD iv<>+8(SB), Y10
  3175. VPBROADCASTD iv<>+12(SB), Y11
  3176. VMOVDQU 512(SP), Y12
  3177. VMOVDQU 544(SP), Y13
  3178. VPBROADCASTD seq<>+4(SB), Y14
  3179. VPSLLD $0x06, Y14, Y14
  3180. VPBROADCASTD 576(SP)(DX*4), Y15
  3181. VMOVDQU Y8, 640(SP)
  3182. // Round 1
  3183. VPADDD Y0, Y4, Y0
  3184. VPADDD (SP), Y0, Y0
  3185. VPXOR Y12, Y0, Y12
  3186. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3187. VMOVDQU 640(SP), Y8
  3188. VPADDD Y8, Y12, Y8
  3189. VPXOR Y4, Y8, Y4
  3190. VMOVDQU Y8, 640(SP)
  3191. VPSRLD $0x0c, Y4, Y8
  3192. VPSLLD $0x14, Y4, Y4
  3193. VPOR Y4, Y8, Y4
  3194. VPADDD Y0, Y4, Y0
  3195. VPADDD 32(SP), Y0, Y0
  3196. VPXOR Y12, Y0, Y12
  3197. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3198. VMOVDQU 640(SP), Y8
  3199. VPADDD Y8, Y12, Y8
  3200. VPXOR Y4, Y8, Y4
  3201. VMOVDQU Y8, 640(SP)
  3202. VPSRLD $0x07, Y4, Y8
  3203. VPSLLD $0x19, Y4, Y4
  3204. VPOR Y4, Y8, Y4
  3205. VPADDD Y1, Y5, Y1
  3206. VPADDD 64(SP), Y1, Y1
  3207. VPXOR Y13, Y1, Y13
  3208. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3209. VPADDD Y9, Y13, Y9
  3210. VPXOR Y5, Y9, Y5
  3211. VPSRLD $0x0c, Y5, Y8
  3212. VPSLLD $0x14, Y5, Y5
  3213. VPOR Y5, Y8, Y5
  3214. VPADDD Y1, Y5, Y1
  3215. VPADDD 96(SP), Y1, Y1
  3216. VPXOR Y13, Y1, Y13
  3217. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3218. VPADDD Y9, Y13, Y9
  3219. VPXOR Y5, Y9, Y5
  3220. VPSRLD $0x07, Y5, Y8
  3221. VPSLLD $0x19, Y5, Y5
  3222. VPOR Y5, Y8, Y5
  3223. VPADDD Y2, Y6, Y2
  3224. VPADDD 128(SP), Y2, Y2
  3225. VPXOR Y14, Y2, Y14
  3226. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3227. VPADDD Y10, Y14, Y10
  3228. VPXOR Y6, Y10, Y6
  3229. VPSRLD $0x0c, Y6, Y8
  3230. VPSLLD $0x14, Y6, Y6
  3231. VPOR Y6, Y8, Y6
  3232. VPADDD Y2, Y6, Y2
  3233. VPADDD 160(SP), Y2, Y2
  3234. VPXOR Y14, Y2, Y14
  3235. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3236. VPADDD Y10, Y14, Y10
  3237. VPXOR Y6, Y10, Y6
  3238. VPSRLD $0x07, Y6, Y8
  3239. VPSLLD $0x19, Y6, Y6
  3240. VPOR Y6, Y8, Y6
  3241. VPADDD Y3, Y7, Y3
  3242. VPADDD 192(SP), Y3, Y3
  3243. VPXOR Y15, Y3, Y15
  3244. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3245. VPADDD Y11, Y15, Y11
  3246. VPXOR Y7, Y11, Y7
  3247. VPSRLD $0x0c, Y7, Y8
  3248. VPSLLD $0x14, Y7, Y7
  3249. VPOR Y7, Y8, Y7
  3250. VPADDD Y3, Y7, Y3
  3251. VPADDD 224(SP), Y3, Y3
  3252. VPXOR Y15, Y3, Y15
  3253. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3254. VPADDD Y11, Y15, Y11
  3255. VPXOR Y7, Y11, Y7
  3256. VPSRLD $0x07, Y7, Y8
  3257. VPSLLD $0x19, Y7, Y7
  3258. VPOR Y7, Y8, Y7
  3259. VPADDD Y0, Y5, Y0
  3260. VPADDD 256(SP), Y0, Y0
  3261. VPXOR Y15, Y0, Y15
  3262. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3263. VPADDD Y10, Y15, Y10
  3264. VPXOR Y5, Y10, Y5
  3265. VPSRLD $0x0c, Y5, Y8
  3266. VPSLLD $0x14, Y5, Y5
  3267. VPOR Y5, Y8, Y5
  3268. VPADDD Y0, Y5, Y0
  3269. VPADDD 288(SP), Y0, Y0
  3270. VPXOR Y15, Y0, Y15
  3271. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3272. VPADDD Y10, Y15, Y10
  3273. VPXOR Y5, Y10, Y5
  3274. VPSRLD $0x07, Y5, Y8
  3275. VPSLLD $0x19, Y5, Y5
  3276. VPOR Y5, Y8, Y5
  3277. VPADDD Y1, Y6, Y1
  3278. VPADDD 320(SP), Y1, Y1
  3279. VPXOR Y12, Y1, Y12
  3280. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3281. VPADDD Y11, Y12, Y11
  3282. VPXOR Y6, Y11, Y6
  3283. VPSRLD $0x0c, Y6, Y8
  3284. VPSLLD $0x14, Y6, Y6
  3285. VPOR Y6, Y8, Y6
  3286. VPADDD Y1, Y6, Y1
  3287. VPADDD 352(SP), Y1, Y1
  3288. VPXOR Y12, Y1, Y12
  3289. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3290. VPADDD Y11, Y12, Y11
  3291. VPXOR Y6, Y11, Y6
  3292. VPSRLD $0x07, Y6, Y8
  3293. VPSLLD $0x19, Y6, Y6
  3294. VPOR Y6, Y8, Y6
  3295. VPADDD Y2, Y7, Y2
  3296. VPADDD 384(SP), Y2, Y2
  3297. VPXOR Y13, Y2, Y13
  3298. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3299. VMOVDQU 640(SP), Y8
  3300. VPADDD Y8, Y13, Y8
  3301. VPXOR Y7, Y8, Y7
  3302. VMOVDQU Y8, 640(SP)
  3303. VPSRLD $0x0c, Y7, Y8
  3304. VPSLLD $0x14, Y7, Y7
  3305. VPOR Y7, Y8, Y7
  3306. VPADDD Y2, Y7, Y2
  3307. VPADDD 416(SP), Y2, Y2
  3308. VPXOR Y13, Y2, Y13
  3309. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3310. VMOVDQU 640(SP), Y8
  3311. VPADDD Y8, Y13, Y8
  3312. VPXOR Y7, Y8, Y7
  3313. VMOVDQU Y8, 640(SP)
  3314. VPSRLD $0x07, Y7, Y8
  3315. VPSLLD $0x19, Y7, Y7
  3316. VPOR Y7, Y8, Y7
  3317. VPADDD Y3, Y4, Y3
  3318. VPADDD 448(SP), Y3, Y3
  3319. VPXOR Y14, Y3, Y14
  3320. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3321. VPADDD Y9, Y14, Y9
  3322. VPXOR Y4, Y9, Y4
  3323. VPSRLD $0x0c, Y4, Y8
  3324. VPSLLD $0x14, Y4, Y4
  3325. VPOR Y4, Y8, Y4
  3326. VPADDD Y3, Y4, Y3
  3327. VPADDD 480(SP), Y3, Y3
  3328. VPXOR Y14, Y3, Y14
  3329. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3330. VPADDD Y9, Y14, Y9
  3331. VPXOR Y4, Y9, Y4
  3332. VPSRLD $0x07, Y4, Y8
  3333. VPSLLD $0x19, Y4, Y4
  3334. VPOR Y4, Y8, Y4
  3335. // Round 2
  3336. VPADDD Y0, Y4, Y0
  3337. VPADDD 64(SP), Y0, Y0
  3338. VPXOR Y12, Y0, Y12
  3339. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3340. VMOVDQU 640(SP), Y8
  3341. VPADDD Y8, Y12, Y8
  3342. VPXOR Y4, Y8, Y4
  3343. VMOVDQU Y8, 640(SP)
  3344. VPSRLD $0x0c, Y4, Y8
  3345. VPSLLD $0x14, Y4, Y4
  3346. VPOR Y4, Y8, Y4
  3347. VPADDD Y0, Y4, Y0
  3348. VPADDD 192(SP), Y0, Y0
  3349. VPXOR Y12, Y0, Y12
  3350. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3351. VMOVDQU 640(SP), Y8
  3352. VPADDD Y8, Y12, Y8
  3353. VPXOR Y4, Y8, Y4
  3354. VMOVDQU Y8, 640(SP)
  3355. VPSRLD $0x07, Y4, Y8
  3356. VPSLLD $0x19, Y4, Y4
  3357. VPOR Y4, Y8, Y4
  3358. VPADDD Y1, Y5, Y1
  3359. VPADDD 96(SP), Y1, Y1
  3360. VPXOR Y13, Y1, Y13
  3361. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3362. VPADDD Y9, Y13, Y9
  3363. VPXOR Y5, Y9, Y5
  3364. VPSRLD $0x0c, Y5, Y8
  3365. VPSLLD $0x14, Y5, Y5
  3366. VPOR Y5, Y8, Y5
  3367. VPADDD Y1, Y5, Y1
  3368. VPADDD 320(SP), Y1, Y1
  3369. VPXOR Y13, Y1, Y13
  3370. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3371. VPADDD Y9, Y13, Y9
  3372. VPXOR Y5, Y9, Y5
  3373. VPSRLD $0x07, Y5, Y8
  3374. VPSLLD $0x19, Y5, Y5
  3375. VPOR Y5, Y8, Y5
  3376. VPADDD Y2, Y6, Y2
  3377. VPADDD 224(SP), Y2, Y2
  3378. VPXOR Y14, Y2, Y14
  3379. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3380. VPADDD Y10, Y14, Y10
  3381. VPXOR Y6, Y10, Y6
  3382. VPSRLD $0x0c, Y6, Y8
  3383. VPSLLD $0x14, Y6, Y6
  3384. VPOR Y6, Y8, Y6
  3385. VPADDD Y2, Y6, Y2
  3386. VPADDD (SP), Y2, Y2
  3387. VPXOR Y14, Y2, Y14
  3388. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3389. VPADDD Y10, Y14, Y10
  3390. VPXOR Y6, Y10, Y6
  3391. VPSRLD $0x07, Y6, Y8
  3392. VPSLLD $0x19, Y6, Y6
  3393. VPOR Y6, Y8, Y6
  3394. VPADDD Y3, Y7, Y3
  3395. VPADDD 128(SP), Y3, Y3
  3396. VPXOR Y15, Y3, Y15
  3397. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3398. VPADDD Y11, Y15, Y11
  3399. VPXOR Y7, Y11, Y7
  3400. VPSRLD $0x0c, Y7, Y8
  3401. VPSLLD $0x14, Y7, Y7
  3402. VPOR Y7, Y8, Y7
  3403. VPADDD Y3, Y7, Y3
  3404. VPADDD 416(SP), Y3, Y3
  3405. VPXOR Y15, Y3, Y15
  3406. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3407. VPADDD Y11, Y15, Y11
  3408. VPXOR Y7, Y11, Y7
  3409. VPSRLD $0x07, Y7, Y8
  3410. VPSLLD $0x19, Y7, Y7
  3411. VPOR Y7, Y8, Y7
  3412. VPADDD Y0, Y5, Y0
  3413. VPADDD 32(SP), Y0, Y0
  3414. VPXOR Y15, Y0, Y15
  3415. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3416. VPADDD Y10, Y15, Y10
  3417. VPXOR Y5, Y10, Y5
  3418. VPSRLD $0x0c, Y5, Y8
  3419. VPSLLD $0x14, Y5, Y5
  3420. VPOR Y5, Y8, Y5
  3421. VPADDD Y0, Y5, Y0
  3422. VPADDD 352(SP), Y0, Y0
  3423. VPXOR Y15, Y0, Y15
  3424. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3425. VPADDD Y10, Y15, Y10
  3426. VPXOR Y5, Y10, Y5
  3427. VPSRLD $0x07, Y5, Y8
  3428. VPSLLD $0x19, Y5, Y5
  3429. VPOR Y5, Y8, Y5
  3430. VPADDD Y1, Y6, Y1
  3431. VPADDD 384(SP), Y1, Y1
  3432. VPXOR Y12, Y1, Y12
  3433. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3434. VPADDD Y11, Y12, Y11
  3435. VPXOR Y6, Y11, Y6
  3436. VPSRLD $0x0c, Y6, Y8
  3437. VPSLLD $0x14, Y6, Y6
  3438. VPOR Y6, Y8, Y6
  3439. VPADDD Y1, Y6, Y1
  3440. VPADDD 160(SP), Y1, Y1
  3441. VPXOR Y12, Y1, Y12
  3442. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3443. VPADDD Y11, Y12, Y11
  3444. VPXOR Y6, Y11, Y6
  3445. VPSRLD $0x07, Y6, Y8
  3446. VPSLLD $0x19, Y6, Y6
  3447. VPOR Y6, Y8, Y6
  3448. VPADDD Y2, Y7, Y2
  3449. VPADDD 288(SP), Y2, Y2
  3450. VPXOR Y13, Y2, Y13
  3451. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3452. VMOVDQU 640(SP), Y8
  3453. VPADDD Y8, Y13, Y8
  3454. VPXOR Y7, Y8, Y7
  3455. VMOVDQU Y8, 640(SP)
  3456. VPSRLD $0x0c, Y7, Y8
  3457. VPSLLD $0x14, Y7, Y7
  3458. VPOR Y7, Y8, Y7
  3459. VPADDD Y2, Y7, Y2
  3460. VPADDD 448(SP), Y2, Y2
  3461. VPXOR Y13, Y2, Y13
  3462. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3463. VMOVDQU 640(SP), Y8
  3464. VPADDD Y8, Y13, Y8
  3465. VPXOR Y7, Y8, Y7
  3466. VMOVDQU Y8, 640(SP)
  3467. VPSRLD $0x07, Y7, Y8
  3468. VPSLLD $0x19, Y7, Y7
  3469. VPOR Y7, Y8, Y7
  3470. VPADDD Y3, Y4, Y3
  3471. VPADDD 480(SP), Y3, Y3
  3472. VPXOR Y14, Y3, Y14
  3473. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3474. VPADDD Y9, Y14, Y9
  3475. VPXOR Y4, Y9, Y4
  3476. VPSRLD $0x0c, Y4, Y8
  3477. VPSLLD $0x14, Y4, Y4
  3478. VPOR Y4, Y8, Y4
  3479. VPADDD Y3, Y4, Y3
  3480. VPADDD 256(SP), Y3, Y3
  3481. VPXOR Y14, Y3, Y14
  3482. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3483. VPADDD Y9, Y14, Y9
  3484. VPXOR Y4, Y9, Y4
  3485. VPSRLD $0x07, Y4, Y8
  3486. VPSLLD $0x19, Y4, Y4
  3487. VPOR Y4, Y8, Y4
  3488. // Round 3
  3489. VPADDD Y0, Y4, Y0
  3490. VPADDD 96(SP), Y0, Y0
  3491. VPXOR Y12, Y0, Y12
  3492. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3493. VMOVDQU 640(SP), Y8
  3494. VPADDD Y8, Y12, Y8
  3495. VPXOR Y4, Y8, Y4
  3496. VMOVDQU Y8, 640(SP)
  3497. VPSRLD $0x0c, Y4, Y8
  3498. VPSLLD $0x14, Y4, Y4
  3499. VPOR Y4, Y8, Y4
  3500. VPADDD Y0, Y4, Y0
  3501. VPADDD 128(SP), Y0, Y0
  3502. VPXOR Y12, Y0, Y12
  3503. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3504. VMOVDQU 640(SP), Y8
  3505. VPADDD Y8, Y12, Y8
  3506. VPXOR Y4, Y8, Y4
  3507. VMOVDQU Y8, 640(SP)
  3508. VPSRLD $0x07, Y4, Y8
  3509. VPSLLD $0x19, Y4, Y4
  3510. VPOR Y4, Y8, Y4
  3511. VPADDD Y1, Y5, Y1
  3512. VPADDD 320(SP), Y1, Y1
  3513. VPXOR Y13, Y1, Y13
  3514. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3515. VPADDD Y9, Y13, Y9
  3516. VPXOR Y5, Y9, Y5
  3517. VPSRLD $0x0c, Y5, Y8
  3518. VPSLLD $0x14, Y5, Y5
  3519. VPOR Y5, Y8, Y5
  3520. VPADDD Y1, Y5, Y1
  3521. VPADDD 384(SP), Y1, Y1
  3522. VPXOR Y13, Y1, Y13
  3523. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3524. VPADDD Y9, Y13, Y9
  3525. VPXOR Y5, Y9, Y5
  3526. VPSRLD $0x07, Y5, Y8
  3527. VPSLLD $0x19, Y5, Y5
  3528. VPOR Y5, Y8, Y5
  3529. VPADDD Y2, Y6, Y2
  3530. VPADDD 416(SP), Y2, Y2
  3531. VPXOR Y14, Y2, Y14
  3532. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3533. VPADDD Y10, Y14, Y10
  3534. VPXOR Y6, Y10, Y6
  3535. VPSRLD $0x0c, Y6, Y8
  3536. VPSLLD $0x14, Y6, Y6
  3537. VPOR Y6, Y8, Y6
  3538. VPADDD Y2, Y6, Y2
  3539. VPADDD 64(SP), Y2, Y2
  3540. VPXOR Y14, Y2, Y14
  3541. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3542. VPADDD Y10, Y14, Y10
  3543. VPXOR Y6, Y10, Y6
  3544. VPSRLD $0x07, Y6, Y8
  3545. VPSLLD $0x19, Y6, Y6
  3546. VPOR Y6, Y8, Y6
  3547. VPADDD Y3, Y7, Y3
  3548. VPADDD 224(SP), Y3, Y3
  3549. VPXOR Y15, Y3, Y15
  3550. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3551. VPADDD Y11, Y15, Y11
  3552. VPXOR Y7, Y11, Y7
  3553. VPSRLD $0x0c, Y7, Y8
  3554. VPSLLD $0x14, Y7, Y7
  3555. VPOR Y7, Y8, Y7
  3556. VPADDD Y3, Y7, Y3
  3557. VPADDD 448(SP), Y3, Y3
  3558. VPXOR Y15, Y3, Y15
  3559. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3560. VPADDD Y11, Y15, Y11
  3561. VPXOR Y7, Y11, Y7
  3562. VPSRLD $0x07, Y7, Y8
  3563. VPSLLD $0x19, Y7, Y7
  3564. VPOR Y7, Y8, Y7
  3565. VPADDD Y0, Y5, Y0
  3566. VPADDD 192(SP), Y0, Y0
  3567. VPXOR Y15, Y0, Y15
  3568. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3569. VPADDD Y10, Y15, Y10
  3570. VPXOR Y5, Y10, Y5
  3571. VPSRLD $0x0c, Y5, Y8
  3572. VPSLLD $0x14, Y5, Y5
  3573. VPOR Y5, Y8, Y5
  3574. VPADDD Y0, Y5, Y0
  3575. VPADDD 160(SP), Y0, Y0
  3576. VPXOR Y15, Y0, Y15
  3577. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3578. VPADDD Y10, Y15, Y10
  3579. VPXOR Y5, Y10, Y5
  3580. VPSRLD $0x07, Y5, Y8
  3581. VPSLLD $0x19, Y5, Y5
  3582. VPOR Y5, Y8, Y5
  3583. VPADDD Y1, Y6, Y1
  3584. VPADDD 288(SP), Y1, Y1
  3585. VPXOR Y12, Y1, Y12
  3586. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3587. VPADDD Y11, Y12, Y11
  3588. VPXOR Y6, Y11, Y6
  3589. VPSRLD $0x0c, Y6, Y8
  3590. VPSLLD $0x14, Y6, Y6
  3591. VPOR Y6, Y8, Y6
  3592. VPADDD Y1, Y6, Y1
  3593. VPADDD (SP), Y1, Y1
  3594. VPXOR Y12, Y1, Y12
  3595. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3596. VPADDD Y11, Y12, Y11
  3597. VPXOR Y6, Y11, Y6
  3598. VPSRLD $0x07, Y6, Y8
  3599. VPSLLD $0x19, Y6, Y6
  3600. VPOR Y6, Y8, Y6
  3601. VPADDD Y2, Y7, Y2
  3602. VPADDD 352(SP), Y2, Y2
  3603. VPXOR Y13, Y2, Y13
  3604. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3605. VMOVDQU 640(SP), Y8
  3606. VPADDD Y8, Y13, Y8
  3607. VPXOR Y7, Y8, Y7
  3608. VMOVDQU Y8, 640(SP)
  3609. VPSRLD $0x0c, Y7, Y8
  3610. VPSLLD $0x14, Y7, Y7
  3611. VPOR Y7, Y8, Y7
  3612. VPADDD Y2, Y7, Y2
  3613. VPADDD 480(SP), Y2, Y2
  3614. VPXOR Y13, Y2, Y13
  3615. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3616. VMOVDQU 640(SP), Y8
  3617. VPADDD Y8, Y13, Y8
  3618. VPXOR Y7, Y8, Y7
  3619. VMOVDQU Y8, 640(SP)
  3620. VPSRLD $0x07, Y7, Y8
  3621. VPSLLD $0x19, Y7, Y7
  3622. VPOR Y7, Y8, Y7
  3623. VPADDD Y3, Y4, Y3
  3624. VPADDD 256(SP), Y3, Y3
  3625. VPXOR Y14, Y3, Y14
  3626. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3627. VPADDD Y9, Y14, Y9
  3628. VPXOR Y4, Y9, Y4
  3629. VPSRLD $0x0c, Y4, Y8
  3630. VPSLLD $0x14, Y4, Y4
  3631. VPOR Y4, Y8, Y4
  3632. VPADDD Y3, Y4, Y3
  3633. VPADDD 32(SP), Y3, Y3
  3634. VPXOR Y14, Y3, Y14
  3635. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3636. VPADDD Y9, Y14, Y9
  3637. VPXOR Y4, Y9, Y4
  3638. VPSRLD $0x07, Y4, Y8
  3639. VPSLLD $0x19, Y4, Y4
  3640. VPOR Y4, Y8, Y4
  3641. // Round 4
  3642. VPADDD Y0, Y4, Y0
  3643. VPADDD 320(SP), Y0, Y0
  3644. VPXOR Y12, Y0, Y12
  3645. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3646. VMOVDQU 640(SP), Y8
  3647. VPADDD Y8, Y12, Y8
  3648. VPXOR Y4, Y8, Y4
  3649. VMOVDQU Y8, 640(SP)
  3650. VPSRLD $0x0c, Y4, Y8
  3651. VPSLLD $0x14, Y4, Y4
  3652. VPOR Y4, Y8, Y4
  3653. VPADDD Y0, Y4, Y0
  3654. VPADDD 224(SP), Y0, Y0
  3655. VPXOR Y12, Y0, Y12
  3656. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3657. VMOVDQU 640(SP), Y8
  3658. VPADDD Y8, Y12, Y8
  3659. VPXOR Y4, Y8, Y4
  3660. VMOVDQU Y8, 640(SP)
  3661. VPSRLD $0x07, Y4, Y8
  3662. VPSLLD $0x19, Y4, Y4
  3663. VPOR Y4, Y8, Y4
  3664. VPADDD Y1, Y5, Y1
  3665. VPADDD 384(SP), Y1, Y1
  3666. VPXOR Y13, Y1, Y13
  3667. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3668. VPADDD Y9, Y13, Y9
  3669. VPXOR Y5, Y9, Y5
  3670. VPSRLD $0x0c, Y5, Y8
  3671. VPSLLD $0x14, Y5, Y5
  3672. VPOR Y5, Y8, Y5
  3673. VPADDD Y1, Y5, Y1
  3674. VPADDD 288(SP), Y1, Y1
  3675. VPXOR Y13, Y1, Y13
  3676. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3677. VPADDD Y9, Y13, Y9
  3678. VPXOR Y5, Y9, Y5
  3679. VPSRLD $0x07, Y5, Y8
  3680. VPSLLD $0x19, Y5, Y5
  3681. VPOR Y5, Y8, Y5
  3682. VPADDD Y2, Y6, Y2
  3683. VPADDD 448(SP), Y2, Y2
  3684. VPXOR Y14, Y2, Y14
  3685. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3686. VPADDD Y10, Y14, Y10
  3687. VPXOR Y6, Y10, Y6
  3688. VPSRLD $0x0c, Y6, Y8
  3689. VPSLLD $0x14, Y6, Y6
  3690. VPOR Y6, Y8, Y6
  3691. VPADDD Y2, Y6, Y2
  3692. VPADDD 96(SP), Y2, Y2
  3693. VPXOR Y14, Y2, Y14
  3694. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3695. VPADDD Y10, Y14, Y10
  3696. VPXOR Y6, Y10, Y6
  3697. VPSRLD $0x07, Y6, Y8
  3698. VPSLLD $0x19, Y6, Y6
  3699. VPOR Y6, Y8, Y6
  3700. VPADDD Y3, Y7, Y3
  3701. VPADDD 416(SP), Y3, Y3
  3702. VPXOR Y15, Y3, Y15
  3703. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3704. VPADDD Y11, Y15, Y11
  3705. VPXOR Y7, Y11, Y7
  3706. VPSRLD $0x0c, Y7, Y8
  3707. VPSLLD $0x14, Y7, Y7
  3708. VPOR Y7, Y8, Y7
  3709. VPADDD Y3, Y7, Y3
  3710. VPADDD 480(SP), Y3, Y3
  3711. VPXOR Y15, Y3, Y15
  3712. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3713. VPADDD Y11, Y15, Y11
  3714. VPXOR Y7, Y11, Y7
  3715. VPSRLD $0x07, Y7, Y8
  3716. VPSLLD $0x19, Y7, Y7
  3717. VPOR Y7, Y8, Y7
  3718. VPADDD Y0, Y5, Y0
  3719. VPADDD 128(SP), Y0, Y0
  3720. VPXOR Y15, Y0, Y15
  3721. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3722. VPADDD Y10, Y15, Y10
  3723. VPXOR Y5, Y10, Y5
  3724. VPSRLD $0x0c, Y5, Y8
  3725. VPSLLD $0x14, Y5, Y5
  3726. VPOR Y5, Y8, Y5
  3727. VPADDD Y0, Y5, Y0
  3728. VPADDD (SP), Y0, Y0
  3729. VPXOR Y15, Y0, Y15
  3730. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3731. VPADDD Y10, Y15, Y10
  3732. VPXOR Y5, Y10, Y5
  3733. VPSRLD $0x07, Y5, Y8
  3734. VPSLLD $0x19, Y5, Y5
  3735. VPOR Y5, Y8, Y5
  3736. VPADDD Y1, Y6, Y1
  3737. VPADDD 352(SP), Y1, Y1
  3738. VPXOR Y12, Y1, Y12
  3739. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3740. VPADDD Y11, Y12, Y11
  3741. VPXOR Y6, Y11, Y6
  3742. VPSRLD $0x0c, Y6, Y8
  3743. VPSLLD $0x14, Y6, Y6
  3744. VPOR Y6, Y8, Y6
  3745. VPADDD Y1, Y6, Y1
  3746. VPADDD 64(SP), Y1, Y1
  3747. VPXOR Y12, Y1, Y12
  3748. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3749. VPADDD Y11, Y12, Y11
  3750. VPXOR Y6, Y11, Y6
  3751. VPSRLD $0x07, Y6, Y8
  3752. VPSLLD $0x19, Y6, Y6
  3753. VPOR Y6, Y8, Y6
  3754. VPADDD Y2, Y7, Y2
  3755. VPADDD 160(SP), Y2, Y2
  3756. VPXOR Y13, Y2, Y13
  3757. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3758. VMOVDQU 640(SP), Y8
  3759. VPADDD Y8, Y13, Y8
  3760. VPXOR Y7, Y8, Y7
  3761. VMOVDQU Y8, 640(SP)
  3762. VPSRLD $0x0c, Y7, Y8
  3763. VPSLLD $0x14, Y7, Y7
  3764. VPOR Y7, Y8, Y7
  3765. VPADDD Y2, Y7, Y2
  3766. VPADDD 256(SP), Y2, Y2
  3767. VPXOR Y13, Y2, Y13
  3768. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3769. VMOVDQU 640(SP), Y8
  3770. VPADDD Y8, Y13, Y8
  3771. VPXOR Y7, Y8, Y7
  3772. VMOVDQU Y8, 640(SP)
  3773. VPSRLD $0x07, Y7, Y8
  3774. VPSLLD $0x19, Y7, Y7
  3775. VPOR Y7, Y8, Y7
  3776. VPADDD Y3, Y4, Y3
  3777. VPADDD 32(SP), Y3, Y3
  3778. VPXOR Y14, Y3, Y14
  3779. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3780. VPADDD Y9, Y14, Y9
  3781. VPXOR Y4, Y9, Y4
  3782. VPSRLD $0x0c, Y4, Y8
  3783. VPSLLD $0x14, Y4, Y4
  3784. VPOR Y4, Y8, Y4
  3785. VPADDD Y3, Y4, Y3
  3786. VPADDD 192(SP), Y3, Y3
  3787. VPXOR Y14, Y3, Y14
  3788. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3789. VPADDD Y9, Y14, Y9
  3790. VPXOR Y4, Y9, Y4
  3791. VPSRLD $0x07, Y4, Y8
  3792. VPSLLD $0x19, Y4, Y4
  3793. VPOR Y4, Y8, Y4
  3794. // Round 5
  3795. VPADDD Y0, Y4, Y0
  3796. VPADDD 384(SP), Y0, Y0
  3797. VPXOR Y12, Y0, Y12
  3798. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3799. VMOVDQU 640(SP), Y8
  3800. VPADDD Y8, Y12, Y8
  3801. VPXOR Y4, Y8, Y4
  3802. VMOVDQU Y8, 640(SP)
  3803. VPSRLD $0x0c, Y4, Y8
  3804. VPSLLD $0x14, Y4, Y4
  3805. VPOR Y4, Y8, Y4
  3806. VPADDD Y0, Y4, Y0
  3807. VPADDD 416(SP), Y0, Y0
  3808. VPXOR Y12, Y0, Y12
  3809. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3810. VMOVDQU 640(SP), Y8
  3811. VPADDD Y8, Y12, Y8
  3812. VPXOR Y4, Y8, Y4
  3813. VMOVDQU Y8, 640(SP)
  3814. VPSRLD $0x07, Y4, Y8
  3815. VPSLLD $0x19, Y4, Y4
  3816. VPOR Y4, Y8, Y4
  3817. VPADDD Y1, Y5, Y1
  3818. VPADDD 288(SP), Y1, Y1
  3819. VPXOR Y13, Y1, Y13
  3820. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3821. VPADDD Y9, Y13, Y9
  3822. VPXOR Y5, Y9, Y5
  3823. VPSRLD $0x0c, Y5, Y8
  3824. VPSLLD $0x14, Y5, Y5
  3825. VPOR Y5, Y8, Y5
  3826. VPADDD Y1, Y5, Y1
  3827. VPADDD 352(SP), Y1, Y1
  3828. VPXOR Y13, Y1, Y13
  3829. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3830. VPADDD Y9, Y13, Y9
  3831. VPXOR Y5, Y9, Y5
  3832. VPSRLD $0x07, Y5, Y8
  3833. VPSLLD $0x19, Y5, Y5
  3834. VPOR Y5, Y8, Y5
  3835. VPADDD Y2, Y6, Y2
  3836. VPADDD 480(SP), Y2, Y2
  3837. VPXOR Y14, Y2, Y14
  3838. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3839. VPADDD Y10, Y14, Y10
  3840. VPXOR Y6, Y10, Y6
  3841. VPSRLD $0x0c, Y6, Y8
  3842. VPSLLD $0x14, Y6, Y6
  3843. VPOR Y6, Y8, Y6
  3844. VPADDD Y2, Y6, Y2
  3845. VPADDD 320(SP), Y2, Y2
  3846. VPXOR Y14, Y2, Y14
  3847. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3848. VPADDD Y10, Y14, Y10
  3849. VPXOR Y6, Y10, Y6
  3850. VPSRLD $0x07, Y6, Y8
  3851. VPSLLD $0x19, Y6, Y6
  3852. VPOR Y6, Y8, Y6
  3853. VPADDD Y3, Y7, Y3
  3854. VPADDD 448(SP), Y3, Y3
  3855. VPXOR Y15, Y3, Y15
  3856. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3857. VPADDD Y11, Y15, Y11
  3858. VPXOR Y7, Y11, Y7
  3859. VPSRLD $0x0c, Y7, Y8
  3860. VPSLLD $0x14, Y7, Y7
  3861. VPOR Y7, Y8, Y7
  3862. VPADDD Y3, Y7, Y3
  3863. VPADDD 256(SP), Y3, Y3
  3864. VPXOR Y15, Y3, Y15
  3865. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3866. VPADDD Y11, Y15, Y11
  3867. VPXOR Y7, Y11, Y7
  3868. VPSRLD $0x07, Y7, Y8
  3869. VPSLLD $0x19, Y7, Y7
  3870. VPOR Y7, Y8, Y7
  3871. VPADDD Y0, Y5, Y0
  3872. VPADDD 224(SP), Y0, Y0
  3873. VPXOR Y15, Y0, Y15
  3874. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  3875. VPADDD Y10, Y15, Y10
  3876. VPXOR Y5, Y10, Y5
  3877. VPSRLD $0x0c, Y5, Y8
  3878. VPSLLD $0x14, Y5, Y5
  3879. VPOR Y5, Y8, Y5
  3880. VPADDD Y0, Y5, Y0
  3881. VPADDD 64(SP), Y0, Y0
  3882. VPXOR Y15, Y0, Y15
  3883. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  3884. VPADDD Y10, Y15, Y10
  3885. VPXOR Y5, Y10, Y5
  3886. VPSRLD $0x07, Y5, Y8
  3887. VPSLLD $0x19, Y5, Y5
  3888. VPOR Y5, Y8, Y5
  3889. VPADDD Y1, Y6, Y1
  3890. VPADDD 160(SP), Y1, Y1
  3891. VPXOR Y12, Y1, Y12
  3892. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3893. VPADDD Y11, Y12, Y11
  3894. VPXOR Y6, Y11, Y6
  3895. VPSRLD $0x0c, Y6, Y8
  3896. VPSLLD $0x14, Y6, Y6
  3897. VPOR Y6, Y8, Y6
  3898. VPADDD Y1, Y6, Y1
  3899. VPADDD 96(SP), Y1, Y1
  3900. VPXOR Y12, Y1, Y12
  3901. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3902. VPADDD Y11, Y12, Y11
  3903. VPXOR Y6, Y11, Y6
  3904. VPSRLD $0x07, Y6, Y8
  3905. VPSLLD $0x19, Y6, Y6
  3906. VPOR Y6, Y8, Y6
  3907. VPADDD Y2, Y7, Y2
  3908. VPADDD (SP), Y2, Y2
  3909. VPXOR Y13, Y2, Y13
  3910. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3911. VMOVDQU 640(SP), Y8
  3912. VPADDD Y8, Y13, Y8
  3913. VPXOR Y7, Y8, Y7
  3914. VMOVDQU Y8, 640(SP)
  3915. VPSRLD $0x0c, Y7, Y8
  3916. VPSLLD $0x14, Y7, Y7
  3917. VPOR Y7, Y8, Y7
  3918. VPADDD Y2, Y7, Y2
  3919. VPADDD 32(SP), Y2, Y2
  3920. VPXOR Y13, Y2, Y13
  3921. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3922. VMOVDQU 640(SP), Y8
  3923. VPADDD Y8, Y13, Y8
  3924. VPXOR Y7, Y8, Y7
  3925. VMOVDQU Y8, 640(SP)
  3926. VPSRLD $0x07, Y7, Y8
  3927. VPSLLD $0x19, Y7, Y7
  3928. VPOR Y7, Y8, Y7
  3929. VPADDD Y3, Y4, Y3
  3930. VPADDD 192(SP), Y3, Y3
  3931. VPXOR Y14, Y3, Y14
  3932. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3933. VPADDD Y9, Y14, Y9
  3934. VPXOR Y4, Y9, Y4
  3935. VPSRLD $0x0c, Y4, Y8
  3936. VPSLLD $0x14, Y4, Y4
  3937. VPOR Y4, Y8, Y4
  3938. VPADDD Y3, Y4, Y3
  3939. VPADDD 128(SP), Y3, Y3
  3940. VPXOR Y14, Y3, Y14
  3941. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  3942. VPADDD Y9, Y14, Y9
  3943. VPXOR Y4, Y9, Y4
  3944. VPSRLD $0x07, Y4, Y8
  3945. VPSLLD $0x19, Y4, Y4
  3946. VPOR Y4, Y8, Y4
  3947. // Round 6
  3948. VPADDD Y0, Y4, Y0
  3949. VPADDD 288(SP), Y0, Y0
  3950. VPXOR Y12, Y0, Y12
  3951. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  3952. VMOVDQU 640(SP), Y8
  3953. VPADDD Y8, Y12, Y8
  3954. VPXOR Y4, Y8, Y4
  3955. VMOVDQU Y8, 640(SP)
  3956. VPSRLD $0x0c, Y4, Y8
  3957. VPSLLD $0x14, Y4, Y4
  3958. VPOR Y4, Y8, Y4
  3959. VPADDD Y0, Y4, Y0
  3960. VPADDD 448(SP), Y0, Y0
  3961. VPXOR Y12, Y0, Y12
  3962. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  3963. VMOVDQU 640(SP), Y8
  3964. VPADDD Y8, Y12, Y8
  3965. VPXOR Y4, Y8, Y4
  3966. VMOVDQU Y8, 640(SP)
  3967. VPSRLD $0x07, Y4, Y8
  3968. VPSLLD $0x19, Y4, Y4
  3969. VPOR Y4, Y8, Y4
  3970. VPADDD Y1, Y5, Y1
  3971. VPADDD 352(SP), Y1, Y1
  3972. VPXOR Y13, Y1, Y13
  3973. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  3974. VPADDD Y9, Y13, Y9
  3975. VPXOR Y5, Y9, Y5
  3976. VPSRLD $0x0c, Y5, Y8
  3977. VPSLLD $0x14, Y5, Y5
  3978. VPOR Y5, Y8, Y5
  3979. VPADDD Y1, Y5, Y1
  3980. VPADDD 160(SP), Y1, Y1
  3981. VPXOR Y13, Y1, Y13
  3982. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  3983. VPADDD Y9, Y13, Y9
  3984. VPXOR Y5, Y9, Y5
  3985. VPSRLD $0x07, Y5, Y8
  3986. VPSLLD $0x19, Y5, Y5
  3987. VPOR Y5, Y8, Y5
  3988. VPADDD Y2, Y6, Y2
  3989. VPADDD 256(SP), Y2, Y2
  3990. VPXOR Y14, Y2, Y14
  3991. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  3992. VPADDD Y10, Y14, Y10
  3993. VPXOR Y6, Y10, Y6
  3994. VPSRLD $0x0c, Y6, Y8
  3995. VPSLLD $0x14, Y6, Y6
  3996. VPOR Y6, Y8, Y6
  3997. VPADDD Y2, Y6, Y2
  3998. VPADDD 384(SP), Y2, Y2
  3999. VPXOR Y14, Y2, Y14
  4000. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4001. VPADDD Y10, Y14, Y10
  4002. VPXOR Y6, Y10, Y6
  4003. VPSRLD $0x07, Y6, Y8
  4004. VPSLLD $0x19, Y6, Y6
  4005. VPOR Y6, Y8, Y6
  4006. VPADDD Y3, Y7, Y3
  4007. VPADDD 480(SP), Y3, Y3
  4008. VPXOR Y15, Y3, Y15
  4009. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4010. VPADDD Y11, Y15, Y11
  4011. VPXOR Y7, Y11, Y7
  4012. VPSRLD $0x0c, Y7, Y8
  4013. VPSLLD $0x14, Y7, Y7
  4014. VPOR Y7, Y8, Y7
  4015. VPADDD Y3, Y7, Y3
  4016. VPADDD 32(SP), Y3, Y3
  4017. VPXOR Y15, Y3, Y15
  4018. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4019. VPADDD Y11, Y15, Y11
  4020. VPXOR Y7, Y11, Y7
  4021. VPSRLD $0x07, Y7, Y8
  4022. VPSLLD $0x19, Y7, Y7
  4023. VPOR Y7, Y8, Y7
  4024. VPADDD Y0, Y5, Y0
  4025. VPADDD 416(SP), Y0, Y0
  4026. VPXOR Y15, Y0, Y15
  4027. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4028. VPADDD Y10, Y15, Y10
  4029. VPXOR Y5, Y10, Y5
  4030. VPSRLD $0x0c, Y5, Y8
  4031. VPSLLD $0x14, Y5, Y5
  4032. VPOR Y5, Y8, Y5
  4033. VPADDD Y0, Y5, Y0
  4034. VPADDD 96(SP), Y0, Y0
  4035. VPXOR Y15, Y0, Y15
  4036. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4037. VPADDD Y10, Y15, Y10
  4038. VPXOR Y5, Y10, Y5
  4039. VPSRLD $0x07, Y5, Y8
  4040. VPSLLD $0x19, Y5, Y5
  4041. VPOR Y5, Y8, Y5
  4042. VPADDD Y1, Y6, Y1
  4043. VPADDD (SP), Y1, Y1
  4044. VPXOR Y12, Y1, Y12
  4045. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4046. VPADDD Y11, Y12, Y11
  4047. VPXOR Y6, Y11, Y6
  4048. VPSRLD $0x0c, Y6, Y8
  4049. VPSLLD $0x14, Y6, Y6
  4050. VPOR Y6, Y8, Y6
  4051. VPADDD Y1, Y6, Y1
  4052. VPADDD 320(SP), Y1, Y1
  4053. VPXOR Y12, Y1, Y12
  4054. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4055. VPADDD Y11, Y12, Y11
  4056. VPXOR Y6, Y11, Y6
  4057. VPSRLD $0x07, Y6, Y8
  4058. VPSLLD $0x19, Y6, Y6
  4059. VPOR Y6, Y8, Y6
  4060. VPADDD Y2, Y7, Y2
  4061. VPADDD 64(SP), Y2, Y2
  4062. VPXOR Y13, Y2, Y13
  4063. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4064. VMOVDQU 640(SP), Y8
  4065. VPADDD Y8, Y13, Y8
  4066. VPXOR Y7, Y8, Y7
  4067. VMOVDQU Y8, 640(SP)
  4068. VPSRLD $0x0c, Y7, Y8
  4069. VPSLLD $0x14, Y7, Y7
  4070. VPOR Y7, Y8, Y7
  4071. VPADDD Y2, Y7, Y2
  4072. VPADDD 192(SP), Y2, Y2
  4073. VPXOR Y13, Y2, Y13
  4074. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4075. VMOVDQU 640(SP), Y8
  4076. VPADDD Y8, Y13, Y8
  4077. VPXOR Y7, Y8, Y7
  4078. VMOVDQU Y8, 640(SP)
  4079. VPSRLD $0x07, Y7, Y8
  4080. VPSLLD $0x19, Y7, Y7
  4081. VPOR Y7, Y8, Y7
  4082. VPADDD Y3, Y4, Y3
  4083. VPADDD 128(SP), Y3, Y3
  4084. VPXOR Y14, Y3, Y14
  4085. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4086. VPADDD Y9, Y14, Y9
  4087. VPXOR Y4, Y9, Y4
  4088. VPSRLD $0x0c, Y4, Y8
  4089. VPSLLD $0x14, Y4, Y4
  4090. VPOR Y4, Y8, Y4
  4091. VPADDD Y3, Y4, Y3
  4092. VPADDD 224(SP), Y3, Y3
  4093. VPXOR Y14, Y3, Y14
  4094. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4095. VPADDD Y9, Y14, Y9
  4096. VPXOR Y4, Y9, Y4
  4097. VPSRLD $0x07, Y4, Y8
  4098. VPSLLD $0x19, Y4, Y4
  4099. VPOR Y4, Y8, Y4
  4100. // Round 7
  4101. VPADDD Y0, Y4, Y0
  4102. VPADDD 352(SP), Y0, Y0
  4103. VPXOR Y12, Y0, Y12
  4104. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4105. VMOVDQU 640(SP), Y8
  4106. VPADDD Y8, Y12, Y8
  4107. VPXOR Y4, Y8, Y4
  4108. VMOVDQU Y8, 640(SP)
  4109. VPSRLD $0x0c, Y4, Y8
  4110. VPSLLD $0x14, Y4, Y4
  4111. VPOR Y4, Y8, Y4
  4112. VPADDD Y0, Y4, Y0
  4113. VPADDD 480(SP), Y0, Y0
  4114. VPXOR Y12, Y0, Y12
  4115. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4116. VMOVDQU 640(SP), Y8
  4117. VPADDD Y8, Y12, Y8
  4118. VPXOR Y4, Y8, Y4
  4119. VMOVDQU Y8, 640(SP)
  4120. VPSRLD $0x07, Y4, Y8
  4121. VPSLLD $0x19, Y4, Y4
  4122. VPOR Y4, Y8, Y4
  4123. VPADDD Y1, Y5, Y1
  4124. VPADDD 160(SP), Y1, Y1
  4125. VPXOR Y13, Y1, Y13
  4126. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4127. VPADDD Y9, Y13, Y9
  4128. VPXOR Y5, Y9, Y5
  4129. VPSRLD $0x0c, Y5, Y8
  4130. VPSLLD $0x14, Y5, Y5
  4131. VPOR Y5, Y8, Y5
  4132. VPADDD Y1, Y5, Y1
  4133. VPADDD (SP), Y1, Y1
  4134. VPXOR Y13, Y1, Y13
  4135. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4136. VPADDD Y9, Y13, Y9
  4137. VPXOR Y5, Y9, Y5
  4138. VPSRLD $0x07, Y5, Y8
  4139. VPSLLD $0x19, Y5, Y5
  4140. VPOR Y5, Y8, Y5
  4141. VPADDD Y2, Y6, Y2
  4142. VPADDD 32(SP), Y2, Y2
  4143. VPXOR Y14, Y2, Y14
  4144. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4145. VPADDD Y10, Y14, Y10
  4146. VPXOR Y6, Y10, Y6
  4147. VPSRLD $0x0c, Y6, Y8
  4148. VPSLLD $0x14, Y6, Y6
  4149. VPOR Y6, Y8, Y6
  4150. VPADDD Y2, Y6, Y2
  4151. VPADDD 288(SP), Y2, Y2
  4152. VPXOR Y14, Y2, Y14
  4153. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4154. VPADDD Y10, Y14, Y10
  4155. VPXOR Y6, Y10, Y6
  4156. VPSRLD $0x07, Y6, Y8
  4157. VPSLLD $0x19, Y6, Y6
  4158. VPOR Y6, Y8, Y6
  4159. VPADDD Y3, Y7, Y3
  4160. VPADDD 256(SP), Y3, Y3
  4161. VPXOR Y15, Y3, Y15
  4162. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4163. VPADDD Y11, Y15, Y11
  4164. VPXOR Y7, Y11, Y7
  4165. VPSRLD $0x0c, Y7, Y8
  4166. VPSLLD $0x14, Y7, Y7
  4167. VPOR Y7, Y8, Y7
  4168. VPADDD Y3, Y7, Y3
  4169. VPADDD 192(SP), Y3, Y3
  4170. VPXOR Y15, Y3, Y15
  4171. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4172. VPADDD Y11, Y15, Y11
  4173. VPXOR Y7, Y11, Y7
  4174. VPSRLD $0x07, Y7, Y8
  4175. VPSLLD $0x19, Y7, Y7
  4176. VPOR Y7, Y8, Y7
  4177. VPADDD Y0, Y5, Y0
  4178. VPADDD 448(SP), Y0, Y0
  4179. VPXOR Y15, Y0, Y15
  4180. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4181. VPADDD Y10, Y15, Y10
  4182. VPXOR Y5, Y10, Y5
  4183. VPSRLD $0x0c, Y5, Y8
  4184. VPSLLD $0x14, Y5, Y5
  4185. VPOR Y5, Y8, Y5
  4186. VPADDD Y0, Y5, Y0
  4187. VPADDD 320(SP), Y0, Y0
  4188. VPXOR Y15, Y0, Y15
  4189. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4190. VPADDD Y10, Y15, Y10
  4191. VPXOR Y5, Y10, Y5
  4192. VPSRLD $0x07, Y5, Y8
  4193. VPSLLD $0x19, Y5, Y5
  4194. VPOR Y5, Y8, Y5
  4195. VPADDD Y1, Y6, Y1
  4196. VPADDD 64(SP), Y1, Y1
  4197. VPXOR Y12, Y1, Y12
  4198. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4199. VPADDD Y11, Y12, Y11
  4200. VPXOR Y6, Y11, Y6
  4201. VPSRLD $0x0c, Y6, Y8
  4202. VPSLLD $0x14, Y6, Y6
  4203. VPOR Y6, Y8, Y6
  4204. VPADDD Y1, Y6, Y1
  4205. VPADDD 384(SP), Y1, Y1
  4206. VPXOR Y12, Y1, Y12
  4207. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4208. VPADDD Y11, Y12, Y11
  4209. VPXOR Y6, Y11, Y6
  4210. VPSRLD $0x07, Y6, Y8
  4211. VPSLLD $0x19, Y6, Y6
  4212. VPOR Y6, Y8, Y6
  4213. VPADDD Y2, Y7, Y2
  4214. VPADDD 96(SP), Y2, Y2
  4215. VPXOR Y13, Y2, Y13
  4216. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4217. VMOVDQU 640(SP), Y8
  4218. VPADDD Y8, Y13, Y8
  4219. VPXOR Y7, Y8, Y7
  4220. VMOVDQU Y8, 640(SP)
  4221. VPSRLD $0x0c, Y7, Y8
  4222. VPSLLD $0x14, Y7, Y7
  4223. VPOR Y7, Y8, Y7
  4224. VPADDD Y2, Y7, Y2
  4225. VPADDD 128(SP), Y2, Y2
  4226. VPXOR Y13, Y2, Y13
  4227. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4228. VMOVDQU 640(SP), Y8
  4229. VPADDD Y8, Y13, Y8
  4230. VPXOR Y7, Y8, Y7
  4231. VMOVDQU Y8, 640(SP)
  4232. VPSRLD $0x07, Y7, Y8
  4233. VPSLLD $0x19, Y7, Y7
  4234. VPOR Y7, Y8, Y7
  4235. VPADDD Y3, Y4, Y3
  4236. VPADDD 224(SP), Y3, Y3
  4237. VPXOR Y14, Y3, Y14
  4238. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4239. VPADDD Y9, Y14, Y9
  4240. VPXOR Y4, Y9, Y4
  4241. VPSRLD $0x0c, Y4, Y8
  4242. VPSLLD $0x14, Y4, Y4
  4243. VPOR Y4, Y8, Y4
  4244. VPADDD Y3, Y4, Y3
  4245. VPADDD 416(SP), Y3, Y3
  4246. VPXOR Y14, Y3, Y14
  4247. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4248. VPADDD Y9, Y14, Y9
  4249. VPXOR Y4, Y9, Y4
  4250. VPSRLD $0x07, Y4, Y8
  4251. VPSLLD $0x19, Y4, Y4
  4252. VPOR Y4, Y8, Y4
  4253. VMOVDQU 640(SP), Y8
  4254. // Finalize CVs
  4255. VPXOR Y0, Y8, Y0
  4256. VPXOR Y1, Y9, Y1
  4257. VPXOR Y2, Y10, Y2
  4258. VPXOR Y3, Y11, Y3
  4259. VPXOR Y4, Y12, Y4
  4260. VPXOR Y5, Y13, Y5
  4261. VPXOR Y6, Y14, Y6
  4262. VPXOR Y7, Y15, Y7
  4263. // Loop
  4264. INCQ DX
  4265. CMPQ DX, $0x00000010
  4266. JNE loop
  4267. // Finished; transpose CVs
  4268. VPUNPCKLDQ Y1, Y0, Y8
  4269. VPUNPCKHDQ Y1, Y0, Y9
  4270. VPUNPCKLDQ Y3, Y2, Y10
  4271. VPUNPCKHDQ Y3, Y2, Y11
  4272. VPUNPCKLDQ Y5, Y4, Y12
  4273. VPUNPCKHDQ Y5, Y4, Y13
  4274. VPUNPCKLDQ Y7, Y6, Y14
  4275. VPUNPCKHDQ Y7, Y6, Y15
  4276. VPUNPCKLQDQ Y10, Y8, Y0
  4277. VPUNPCKHQDQ Y10, Y8, Y1
  4278. VPUNPCKLQDQ Y11, Y9, Y2
  4279. VPUNPCKHQDQ Y11, Y9, Y3
  4280. VPUNPCKLQDQ Y14, Y12, Y4
  4281. VPUNPCKHQDQ Y14, Y12, Y5
  4282. VPUNPCKLQDQ Y15, Y13, Y6
  4283. VPUNPCKHQDQ Y15, Y13, Y7
  4284. VPERM2I128 $0x20, Y4, Y0, Y8
  4285. VPERM2I128 $0x31, Y4, Y0, Y12
  4286. VPERM2I128 $0x20, Y5, Y1, Y9
  4287. VPERM2I128 $0x31, Y5, Y1, Y13
  4288. VPERM2I128 $0x20, Y6, Y2, Y10
  4289. VPERM2I128 $0x31, Y6, Y2, Y14
  4290. VPERM2I128 $0x20, Y7, Y3, Y11
  4291. VPERM2I128 $0x31, Y7, Y3, Y15
  4292. VMOVDQU Y8, (AX)
  4293. VMOVDQU Y9, 32(AX)
  4294. VMOVDQU Y10, 64(AX)
  4295. VMOVDQU Y11, 96(AX)
  4296. VMOVDQU Y12, 128(AX)
  4297. VMOVDQU Y13, 160(AX)
  4298. VMOVDQU Y14, 192(AX)
  4299. VMOVDQU Y15, 224(AX)
  4300. RET
  4301. // func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
  4302. // Requires: AVX, AVX2
  4303. TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-32
  4304. MOVQ parents+0(FP), AX
  4305. MOVQ cvs+8(FP), CX
  4306. MOVQ key+16(FP), DX
  4307. // Load transposed block
  4308. VMOVDQU seq<>+0(SB), Y9
  4309. VPSLLD $0x06, Y9, Y9
  4310. VPCMPEQD Y8, Y8, Y8
  4311. VPGATHERDD Y8, (CX)(Y9*1), Y10
  4312. VMOVDQU Y10, (SP)
  4313. VPCMPEQD Y8, Y8, Y8
  4314. VPGATHERDD Y8, 4(CX)(Y9*1), Y10
  4315. VMOVDQU Y10, 32(SP)
  4316. VPCMPEQD Y8, Y8, Y8
  4317. VPGATHERDD Y8, 8(CX)(Y9*1), Y10
  4318. VMOVDQU Y10, 64(SP)
  4319. VPCMPEQD Y8, Y8, Y8
  4320. VPGATHERDD Y8, 12(CX)(Y9*1), Y10
  4321. VMOVDQU Y10, 96(SP)
  4322. VPCMPEQD Y8, Y8, Y8
  4323. VPGATHERDD Y8, 16(CX)(Y9*1), Y10
  4324. VMOVDQU Y10, 128(SP)
  4325. VPCMPEQD Y8, Y8, Y8
  4326. VPGATHERDD Y8, 20(CX)(Y9*1), Y10
  4327. VMOVDQU Y10, 160(SP)
  4328. VPCMPEQD Y8, Y8, Y8
  4329. VPGATHERDD Y8, 24(CX)(Y9*1), Y10
  4330. VMOVDQU Y10, 192(SP)
  4331. VPCMPEQD Y8, Y8, Y8
  4332. VPGATHERDD Y8, 28(CX)(Y9*1), Y10
  4333. VMOVDQU Y10, 224(SP)
  4334. VPCMPEQD Y8, Y8, Y8
  4335. VPGATHERDD Y8, 32(CX)(Y9*1), Y10
  4336. VMOVDQU Y10, 256(SP)
  4337. VPCMPEQD Y8, Y8, Y8
  4338. VPGATHERDD Y8, 36(CX)(Y9*1), Y10
  4339. VMOVDQU Y10, 288(SP)
  4340. VPCMPEQD Y8, Y8, Y8
  4341. VPGATHERDD Y8, 40(CX)(Y9*1), Y10
  4342. VMOVDQU Y10, 320(SP)
  4343. VPCMPEQD Y8, Y8, Y8
  4344. VPGATHERDD Y8, 44(CX)(Y9*1), Y10
  4345. VMOVDQU Y10, 352(SP)
  4346. VPCMPEQD Y8, Y8, Y8
  4347. VPGATHERDD Y8, 48(CX)(Y9*1), Y10
  4348. VMOVDQU Y10, 384(SP)
  4349. VPCMPEQD Y8, Y8, Y8
  4350. VPGATHERDD Y8, 52(CX)(Y9*1), Y10
  4351. VMOVDQU Y10, 416(SP)
  4352. VPCMPEQD Y8, Y8, Y8
  4353. VPGATHERDD Y8, 56(CX)(Y9*1), Y10
  4354. VMOVDQU Y10, 448(SP)
  4355. VPCMPEQD Y8, Y8, Y8
  4356. VPGATHERDD Y8, 60(CX)(Y9*1), Y10
  4357. VMOVDQU Y10, 480(SP)
  4358. // Initialize state vectors
  4359. VPBROADCASTD (DX), Y0
  4360. VPBROADCASTD 4(DX), Y1
  4361. VPBROADCASTD 8(DX), Y2
  4362. VPBROADCASTD 12(DX), Y3
  4363. VPBROADCASTD 16(DX), Y4
  4364. VPBROADCASTD 20(DX), Y5
  4365. VPBROADCASTD 24(DX), Y6
  4366. VPBROADCASTD 28(DX), Y7
  4367. VPBROADCASTD iv<>+0(SB), Y8
  4368. VPBROADCASTD iv<>+4(SB), Y9
  4369. VPBROADCASTD iv<>+8(SB), Y10
  4370. VPBROADCASTD iv<>+12(SB), Y11
  4371. VPXOR Y12, Y12, Y12
  4372. VPXOR Y13, Y13, Y13
  4373. VPBROADCASTD seq<>+4(SB), Y14
  4374. VPSLLD $0x06, Y14, Y14
  4375. ORL $0x04, flags+24(FP)
  4376. VPBROADCASTD flags+24(FP), Y15
  4377. VMOVDQU Y8, 512(SP)
  4378. // Round 1
  4379. VPADDD Y0, Y4, Y0
  4380. VPADDD (SP), Y0, Y0
  4381. VPXOR Y12, Y0, Y12
  4382. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4383. VMOVDQU 512(SP), Y8
  4384. VPADDD Y8, Y12, Y8
  4385. VPXOR Y4, Y8, Y4
  4386. VMOVDQU Y8, 512(SP)
  4387. VPSRLD $0x0c, Y4, Y8
  4388. VPSLLD $0x14, Y4, Y4
  4389. VPOR Y4, Y8, Y4
  4390. VPADDD Y0, Y4, Y0
  4391. VPADDD 32(SP), Y0, Y0
  4392. VPXOR Y12, Y0, Y12
  4393. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4394. VMOVDQU 512(SP), Y8
  4395. VPADDD Y8, Y12, Y8
  4396. VPXOR Y4, Y8, Y4
  4397. VMOVDQU Y8, 512(SP)
  4398. VPSRLD $0x07, Y4, Y8
  4399. VPSLLD $0x19, Y4, Y4
  4400. VPOR Y4, Y8, Y4
  4401. VPADDD Y1, Y5, Y1
  4402. VPADDD 64(SP), Y1, Y1
  4403. VPXOR Y13, Y1, Y13
  4404. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4405. VPADDD Y9, Y13, Y9
  4406. VPXOR Y5, Y9, Y5
  4407. VPSRLD $0x0c, Y5, Y8
  4408. VPSLLD $0x14, Y5, Y5
  4409. VPOR Y5, Y8, Y5
  4410. VPADDD Y1, Y5, Y1
  4411. VPADDD 96(SP), Y1, Y1
  4412. VPXOR Y13, Y1, Y13
  4413. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4414. VPADDD Y9, Y13, Y9
  4415. VPXOR Y5, Y9, Y5
  4416. VPSRLD $0x07, Y5, Y8
  4417. VPSLLD $0x19, Y5, Y5
  4418. VPOR Y5, Y8, Y5
  4419. VPADDD Y2, Y6, Y2
  4420. VPADDD 128(SP), Y2, Y2
  4421. VPXOR Y14, Y2, Y14
  4422. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4423. VPADDD Y10, Y14, Y10
  4424. VPXOR Y6, Y10, Y6
  4425. VPSRLD $0x0c, Y6, Y8
  4426. VPSLLD $0x14, Y6, Y6
  4427. VPOR Y6, Y8, Y6
  4428. VPADDD Y2, Y6, Y2
  4429. VPADDD 160(SP), Y2, Y2
  4430. VPXOR Y14, Y2, Y14
  4431. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4432. VPADDD Y10, Y14, Y10
  4433. VPXOR Y6, Y10, Y6
  4434. VPSRLD $0x07, Y6, Y8
  4435. VPSLLD $0x19, Y6, Y6
  4436. VPOR Y6, Y8, Y6
  4437. VPADDD Y3, Y7, Y3
  4438. VPADDD 192(SP), Y3, Y3
  4439. VPXOR Y15, Y3, Y15
  4440. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4441. VPADDD Y11, Y15, Y11
  4442. VPXOR Y7, Y11, Y7
  4443. VPSRLD $0x0c, Y7, Y8
  4444. VPSLLD $0x14, Y7, Y7
  4445. VPOR Y7, Y8, Y7
  4446. VPADDD Y3, Y7, Y3
  4447. VPADDD 224(SP), Y3, Y3
  4448. VPXOR Y15, Y3, Y15
  4449. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4450. VPADDD Y11, Y15, Y11
  4451. VPXOR Y7, Y11, Y7
  4452. VPSRLD $0x07, Y7, Y8
  4453. VPSLLD $0x19, Y7, Y7
  4454. VPOR Y7, Y8, Y7
  4455. VPADDD Y0, Y5, Y0
  4456. VPADDD 256(SP), Y0, Y0
  4457. VPXOR Y15, Y0, Y15
  4458. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4459. VPADDD Y10, Y15, Y10
  4460. VPXOR Y5, Y10, Y5
  4461. VPSRLD $0x0c, Y5, Y8
  4462. VPSLLD $0x14, Y5, Y5
  4463. VPOR Y5, Y8, Y5
  4464. VPADDD Y0, Y5, Y0
  4465. VPADDD 288(SP), Y0, Y0
  4466. VPXOR Y15, Y0, Y15
  4467. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4468. VPADDD Y10, Y15, Y10
  4469. VPXOR Y5, Y10, Y5
  4470. VPSRLD $0x07, Y5, Y8
  4471. VPSLLD $0x19, Y5, Y5
  4472. VPOR Y5, Y8, Y5
  4473. VPADDD Y1, Y6, Y1
  4474. VPADDD 320(SP), Y1, Y1
  4475. VPXOR Y12, Y1, Y12
  4476. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4477. VPADDD Y11, Y12, Y11
  4478. VPXOR Y6, Y11, Y6
  4479. VPSRLD $0x0c, Y6, Y8
  4480. VPSLLD $0x14, Y6, Y6
  4481. VPOR Y6, Y8, Y6
  4482. VPADDD Y1, Y6, Y1
  4483. VPADDD 352(SP), Y1, Y1
  4484. VPXOR Y12, Y1, Y12
  4485. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4486. VPADDD Y11, Y12, Y11
  4487. VPXOR Y6, Y11, Y6
  4488. VPSRLD $0x07, Y6, Y8
  4489. VPSLLD $0x19, Y6, Y6
  4490. VPOR Y6, Y8, Y6
  4491. VPADDD Y2, Y7, Y2
  4492. VPADDD 384(SP), Y2, Y2
  4493. VPXOR Y13, Y2, Y13
  4494. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4495. VMOVDQU 512(SP), Y8
  4496. VPADDD Y8, Y13, Y8
  4497. VPXOR Y7, Y8, Y7
  4498. VMOVDQU Y8, 512(SP)
  4499. VPSRLD $0x0c, Y7, Y8
  4500. VPSLLD $0x14, Y7, Y7
  4501. VPOR Y7, Y8, Y7
  4502. VPADDD Y2, Y7, Y2
  4503. VPADDD 416(SP), Y2, Y2
  4504. VPXOR Y13, Y2, Y13
  4505. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4506. VMOVDQU 512(SP), Y8
  4507. VPADDD Y8, Y13, Y8
  4508. VPXOR Y7, Y8, Y7
  4509. VMOVDQU Y8, 512(SP)
  4510. VPSRLD $0x07, Y7, Y8
  4511. VPSLLD $0x19, Y7, Y7
  4512. VPOR Y7, Y8, Y7
  4513. VPADDD Y3, Y4, Y3
  4514. VPADDD 448(SP), Y3, Y3
  4515. VPXOR Y14, Y3, Y14
  4516. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4517. VPADDD Y9, Y14, Y9
  4518. VPXOR Y4, Y9, Y4
  4519. VPSRLD $0x0c, Y4, Y8
  4520. VPSLLD $0x14, Y4, Y4
  4521. VPOR Y4, Y8, Y4
  4522. VPADDD Y3, Y4, Y3
  4523. VPADDD 480(SP), Y3, Y3
  4524. VPXOR Y14, Y3, Y14
  4525. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4526. VPADDD Y9, Y14, Y9
  4527. VPXOR Y4, Y9, Y4
  4528. VPSRLD $0x07, Y4, Y8
  4529. VPSLLD $0x19, Y4, Y4
  4530. VPOR Y4, Y8, Y4
  4531. // Round 2
  4532. VPADDD Y0, Y4, Y0
  4533. VPADDD 64(SP), Y0, Y0
  4534. VPXOR Y12, Y0, Y12
  4535. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4536. VMOVDQU 512(SP), Y8
  4537. VPADDD Y8, Y12, Y8
  4538. VPXOR Y4, Y8, Y4
  4539. VMOVDQU Y8, 512(SP)
  4540. VPSRLD $0x0c, Y4, Y8
  4541. VPSLLD $0x14, Y4, Y4
  4542. VPOR Y4, Y8, Y4
  4543. VPADDD Y0, Y4, Y0
  4544. VPADDD 192(SP), Y0, Y0
  4545. VPXOR Y12, Y0, Y12
  4546. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4547. VMOVDQU 512(SP), Y8
  4548. VPADDD Y8, Y12, Y8
  4549. VPXOR Y4, Y8, Y4
  4550. VMOVDQU Y8, 512(SP)
  4551. VPSRLD $0x07, Y4, Y8
  4552. VPSLLD $0x19, Y4, Y4
  4553. VPOR Y4, Y8, Y4
  4554. VPADDD Y1, Y5, Y1
  4555. VPADDD 96(SP), Y1, Y1
  4556. VPXOR Y13, Y1, Y13
  4557. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4558. VPADDD Y9, Y13, Y9
  4559. VPXOR Y5, Y9, Y5
  4560. VPSRLD $0x0c, Y5, Y8
  4561. VPSLLD $0x14, Y5, Y5
  4562. VPOR Y5, Y8, Y5
  4563. VPADDD Y1, Y5, Y1
  4564. VPADDD 320(SP), Y1, Y1
  4565. VPXOR Y13, Y1, Y13
  4566. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4567. VPADDD Y9, Y13, Y9
  4568. VPXOR Y5, Y9, Y5
  4569. VPSRLD $0x07, Y5, Y8
  4570. VPSLLD $0x19, Y5, Y5
  4571. VPOR Y5, Y8, Y5
  4572. VPADDD Y2, Y6, Y2
  4573. VPADDD 224(SP), Y2, Y2
  4574. VPXOR Y14, Y2, Y14
  4575. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4576. VPADDD Y10, Y14, Y10
  4577. VPXOR Y6, Y10, Y6
  4578. VPSRLD $0x0c, Y6, Y8
  4579. VPSLLD $0x14, Y6, Y6
  4580. VPOR Y6, Y8, Y6
  4581. VPADDD Y2, Y6, Y2
  4582. VPADDD (SP), Y2, Y2
  4583. VPXOR Y14, Y2, Y14
  4584. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4585. VPADDD Y10, Y14, Y10
  4586. VPXOR Y6, Y10, Y6
  4587. VPSRLD $0x07, Y6, Y8
  4588. VPSLLD $0x19, Y6, Y6
  4589. VPOR Y6, Y8, Y6
  4590. VPADDD Y3, Y7, Y3
  4591. VPADDD 128(SP), Y3, Y3
  4592. VPXOR Y15, Y3, Y15
  4593. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4594. VPADDD Y11, Y15, Y11
  4595. VPXOR Y7, Y11, Y7
  4596. VPSRLD $0x0c, Y7, Y8
  4597. VPSLLD $0x14, Y7, Y7
  4598. VPOR Y7, Y8, Y7
  4599. VPADDD Y3, Y7, Y3
  4600. VPADDD 416(SP), Y3, Y3
  4601. VPXOR Y15, Y3, Y15
  4602. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4603. VPADDD Y11, Y15, Y11
  4604. VPXOR Y7, Y11, Y7
  4605. VPSRLD $0x07, Y7, Y8
  4606. VPSLLD $0x19, Y7, Y7
  4607. VPOR Y7, Y8, Y7
  4608. VPADDD Y0, Y5, Y0
  4609. VPADDD 32(SP), Y0, Y0
  4610. VPXOR Y15, Y0, Y15
  4611. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4612. VPADDD Y10, Y15, Y10
  4613. VPXOR Y5, Y10, Y5
  4614. VPSRLD $0x0c, Y5, Y8
  4615. VPSLLD $0x14, Y5, Y5
  4616. VPOR Y5, Y8, Y5
  4617. VPADDD Y0, Y5, Y0
  4618. VPADDD 352(SP), Y0, Y0
  4619. VPXOR Y15, Y0, Y15
  4620. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4621. VPADDD Y10, Y15, Y10
  4622. VPXOR Y5, Y10, Y5
  4623. VPSRLD $0x07, Y5, Y8
  4624. VPSLLD $0x19, Y5, Y5
  4625. VPOR Y5, Y8, Y5
  4626. VPADDD Y1, Y6, Y1
  4627. VPADDD 384(SP), Y1, Y1
  4628. VPXOR Y12, Y1, Y12
  4629. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4630. VPADDD Y11, Y12, Y11
  4631. VPXOR Y6, Y11, Y6
  4632. VPSRLD $0x0c, Y6, Y8
  4633. VPSLLD $0x14, Y6, Y6
  4634. VPOR Y6, Y8, Y6
  4635. VPADDD Y1, Y6, Y1
  4636. VPADDD 160(SP), Y1, Y1
  4637. VPXOR Y12, Y1, Y12
  4638. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4639. VPADDD Y11, Y12, Y11
  4640. VPXOR Y6, Y11, Y6
  4641. VPSRLD $0x07, Y6, Y8
  4642. VPSLLD $0x19, Y6, Y6
  4643. VPOR Y6, Y8, Y6
  4644. VPADDD Y2, Y7, Y2
  4645. VPADDD 288(SP), Y2, Y2
  4646. VPXOR Y13, Y2, Y13
  4647. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4648. VMOVDQU 512(SP), Y8
  4649. VPADDD Y8, Y13, Y8
  4650. VPXOR Y7, Y8, Y7
  4651. VMOVDQU Y8, 512(SP)
  4652. VPSRLD $0x0c, Y7, Y8
  4653. VPSLLD $0x14, Y7, Y7
  4654. VPOR Y7, Y8, Y7
  4655. VPADDD Y2, Y7, Y2
  4656. VPADDD 448(SP), Y2, Y2
  4657. VPXOR Y13, Y2, Y13
  4658. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4659. VMOVDQU 512(SP), Y8
  4660. VPADDD Y8, Y13, Y8
  4661. VPXOR Y7, Y8, Y7
  4662. VMOVDQU Y8, 512(SP)
  4663. VPSRLD $0x07, Y7, Y8
  4664. VPSLLD $0x19, Y7, Y7
  4665. VPOR Y7, Y8, Y7
  4666. VPADDD Y3, Y4, Y3
  4667. VPADDD 480(SP), Y3, Y3
  4668. VPXOR Y14, Y3, Y14
  4669. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4670. VPADDD Y9, Y14, Y9
  4671. VPXOR Y4, Y9, Y4
  4672. VPSRLD $0x0c, Y4, Y8
  4673. VPSLLD $0x14, Y4, Y4
  4674. VPOR Y4, Y8, Y4
  4675. VPADDD Y3, Y4, Y3
  4676. VPADDD 256(SP), Y3, Y3
  4677. VPXOR Y14, Y3, Y14
  4678. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4679. VPADDD Y9, Y14, Y9
  4680. VPXOR Y4, Y9, Y4
  4681. VPSRLD $0x07, Y4, Y8
  4682. VPSLLD $0x19, Y4, Y4
  4683. VPOR Y4, Y8, Y4
  4684. // Round 3
  4685. VPADDD Y0, Y4, Y0
  4686. VPADDD 96(SP), Y0, Y0
  4687. VPXOR Y12, Y0, Y12
  4688. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4689. VMOVDQU 512(SP), Y8
  4690. VPADDD Y8, Y12, Y8
  4691. VPXOR Y4, Y8, Y4
  4692. VMOVDQU Y8, 512(SP)
  4693. VPSRLD $0x0c, Y4, Y8
  4694. VPSLLD $0x14, Y4, Y4
  4695. VPOR Y4, Y8, Y4
  4696. VPADDD Y0, Y4, Y0
  4697. VPADDD 128(SP), Y0, Y0
  4698. VPXOR Y12, Y0, Y12
  4699. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4700. VMOVDQU 512(SP), Y8
  4701. VPADDD Y8, Y12, Y8
  4702. VPXOR Y4, Y8, Y4
  4703. VMOVDQU Y8, 512(SP)
  4704. VPSRLD $0x07, Y4, Y8
  4705. VPSLLD $0x19, Y4, Y4
  4706. VPOR Y4, Y8, Y4
  4707. VPADDD Y1, Y5, Y1
  4708. VPADDD 320(SP), Y1, Y1
  4709. VPXOR Y13, Y1, Y13
  4710. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4711. VPADDD Y9, Y13, Y9
  4712. VPXOR Y5, Y9, Y5
  4713. VPSRLD $0x0c, Y5, Y8
  4714. VPSLLD $0x14, Y5, Y5
  4715. VPOR Y5, Y8, Y5
  4716. VPADDD Y1, Y5, Y1
  4717. VPADDD 384(SP), Y1, Y1
  4718. VPXOR Y13, Y1, Y13
  4719. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4720. VPADDD Y9, Y13, Y9
  4721. VPXOR Y5, Y9, Y5
  4722. VPSRLD $0x07, Y5, Y8
  4723. VPSLLD $0x19, Y5, Y5
  4724. VPOR Y5, Y8, Y5
  4725. VPADDD Y2, Y6, Y2
  4726. VPADDD 416(SP), Y2, Y2
  4727. VPXOR Y14, Y2, Y14
  4728. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4729. VPADDD Y10, Y14, Y10
  4730. VPXOR Y6, Y10, Y6
  4731. VPSRLD $0x0c, Y6, Y8
  4732. VPSLLD $0x14, Y6, Y6
  4733. VPOR Y6, Y8, Y6
  4734. VPADDD Y2, Y6, Y2
  4735. VPADDD 64(SP), Y2, Y2
  4736. VPXOR Y14, Y2, Y14
  4737. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4738. VPADDD Y10, Y14, Y10
  4739. VPXOR Y6, Y10, Y6
  4740. VPSRLD $0x07, Y6, Y8
  4741. VPSLLD $0x19, Y6, Y6
  4742. VPOR Y6, Y8, Y6
  4743. VPADDD Y3, Y7, Y3
  4744. VPADDD 224(SP), Y3, Y3
  4745. VPXOR Y15, Y3, Y15
  4746. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4747. VPADDD Y11, Y15, Y11
  4748. VPXOR Y7, Y11, Y7
  4749. VPSRLD $0x0c, Y7, Y8
  4750. VPSLLD $0x14, Y7, Y7
  4751. VPOR Y7, Y8, Y7
  4752. VPADDD Y3, Y7, Y3
  4753. VPADDD 448(SP), Y3, Y3
  4754. VPXOR Y15, Y3, Y15
  4755. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4756. VPADDD Y11, Y15, Y11
  4757. VPXOR Y7, Y11, Y7
  4758. VPSRLD $0x07, Y7, Y8
  4759. VPSLLD $0x19, Y7, Y7
  4760. VPOR Y7, Y8, Y7
  4761. VPADDD Y0, Y5, Y0
  4762. VPADDD 192(SP), Y0, Y0
  4763. VPXOR Y15, Y0, Y15
  4764. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4765. VPADDD Y10, Y15, Y10
  4766. VPXOR Y5, Y10, Y5
  4767. VPSRLD $0x0c, Y5, Y8
  4768. VPSLLD $0x14, Y5, Y5
  4769. VPOR Y5, Y8, Y5
  4770. VPADDD Y0, Y5, Y0
  4771. VPADDD 160(SP), Y0, Y0
  4772. VPXOR Y15, Y0, Y15
  4773. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4774. VPADDD Y10, Y15, Y10
  4775. VPXOR Y5, Y10, Y5
  4776. VPSRLD $0x07, Y5, Y8
  4777. VPSLLD $0x19, Y5, Y5
  4778. VPOR Y5, Y8, Y5
  4779. VPADDD Y1, Y6, Y1
  4780. VPADDD 288(SP), Y1, Y1
  4781. VPXOR Y12, Y1, Y12
  4782. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4783. VPADDD Y11, Y12, Y11
  4784. VPXOR Y6, Y11, Y6
  4785. VPSRLD $0x0c, Y6, Y8
  4786. VPSLLD $0x14, Y6, Y6
  4787. VPOR Y6, Y8, Y6
  4788. VPADDD Y1, Y6, Y1
  4789. VPADDD (SP), Y1, Y1
  4790. VPXOR Y12, Y1, Y12
  4791. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4792. VPADDD Y11, Y12, Y11
  4793. VPXOR Y6, Y11, Y6
  4794. VPSRLD $0x07, Y6, Y8
  4795. VPSLLD $0x19, Y6, Y6
  4796. VPOR Y6, Y8, Y6
  4797. VPADDD Y2, Y7, Y2
  4798. VPADDD 352(SP), Y2, Y2
  4799. VPXOR Y13, Y2, Y13
  4800. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4801. VMOVDQU 512(SP), Y8
  4802. VPADDD Y8, Y13, Y8
  4803. VPXOR Y7, Y8, Y7
  4804. VMOVDQU Y8, 512(SP)
  4805. VPSRLD $0x0c, Y7, Y8
  4806. VPSLLD $0x14, Y7, Y7
  4807. VPOR Y7, Y8, Y7
  4808. VPADDD Y2, Y7, Y2
  4809. VPADDD 480(SP), Y2, Y2
  4810. VPXOR Y13, Y2, Y13
  4811. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4812. VMOVDQU 512(SP), Y8
  4813. VPADDD Y8, Y13, Y8
  4814. VPXOR Y7, Y8, Y7
  4815. VMOVDQU Y8, 512(SP)
  4816. VPSRLD $0x07, Y7, Y8
  4817. VPSLLD $0x19, Y7, Y7
  4818. VPOR Y7, Y8, Y7
  4819. VPADDD Y3, Y4, Y3
  4820. VPADDD 256(SP), Y3, Y3
  4821. VPXOR Y14, Y3, Y14
  4822. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4823. VPADDD Y9, Y14, Y9
  4824. VPXOR Y4, Y9, Y4
  4825. VPSRLD $0x0c, Y4, Y8
  4826. VPSLLD $0x14, Y4, Y4
  4827. VPOR Y4, Y8, Y4
  4828. VPADDD Y3, Y4, Y3
  4829. VPADDD 32(SP), Y3, Y3
  4830. VPXOR Y14, Y3, Y14
  4831. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4832. VPADDD Y9, Y14, Y9
  4833. VPXOR Y4, Y9, Y4
  4834. VPSRLD $0x07, Y4, Y8
  4835. VPSLLD $0x19, Y4, Y4
  4836. VPOR Y4, Y8, Y4
  4837. // Round 4
  4838. VPADDD Y0, Y4, Y0
  4839. VPADDD 320(SP), Y0, Y0
  4840. VPXOR Y12, Y0, Y12
  4841. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4842. VMOVDQU 512(SP), Y8
  4843. VPADDD Y8, Y12, Y8
  4844. VPXOR Y4, Y8, Y4
  4845. VMOVDQU Y8, 512(SP)
  4846. VPSRLD $0x0c, Y4, Y8
  4847. VPSLLD $0x14, Y4, Y4
  4848. VPOR Y4, Y8, Y4
  4849. VPADDD Y0, Y4, Y0
  4850. VPADDD 224(SP), Y0, Y0
  4851. VPXOR Y12, Y0, Y12
  4852. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4853. VMOVDQU 512(SP), Y8
  4854. VPADDD Y8, Y12, Y8
  4855. VPXOR Y4, Y8, Y4
  4856. VMOVDQU Y8, 512(SP)
  4857. VPSRLD $0x07, Y4, Y8
  4858. VPSLLD $0x19, Y4, Y4
  4859. VPOR Y4, Y8, Y4
  4860. VPADDD Y1, Y5, Y1
  4861. VPADDD 384(SP), Y1, Y1
  4862. VPXOR Y13, Y1, Y13
  4863. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4864. VPADDD Y9, Y13, Y9
  4865. VPXOR Y5, Y9, Y5
  4866. VPSRLD $0x0c, Y5, Y8
  4867. VPSLLD $0x14, Y5, Y5
  4868. VPOR Y5, Y8, Y5
  4869. VPADDD Y1, Y5, Y1
  4870. VPADDD 288(SP), Y1, Y1
  4871. VPXOR Y13, Y1, Y13
  4872. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4873. VPADDD Y9, Y13, Y9
  4874. VPXOR Y5, Y9, Y5
  4875. VPSRLD $0x07, Y5, Y8
  4876. VPSLLD $0x19, Y5, Y5
  4877. VPOR Y5, Y8, Y5
  4878. VPADDD Y2, Y6, Y2
  4879. VPADDD 448(SP), Y2, Y2
  4880. VPXOR Y14, Y2, Y14
  4881. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4882. VPADDD Y10, Y14, Y10
  4883. VPXOR Y6, Y10, Y6
  4884. VPSRLD $0x0c, Y6, Y8
  4885. VPSLLD $0x14, Y6, Y6
  4886. VPOR Y6, Y8, Y6
  4887. VPADDD Y2, Y6, Y2
  4888. VPADDD 96(SP), Y2, Y2
  4889. VPXOR Y14, Y2, Y14
  4890. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4891. VPADDD Y10, Y14, Y10
  4892. VPXOR Y6, Y10, Y6
  4893. VPSRLD $0x07, Y6, Y8
  4894. VPSLLD $0x19, Y6, Y6
  4895. VPOR Y6, Y8, Y6
  4896. VPADDD Y3, Y7, Y3
  4897. VPADDD 416(SP), Y3, Y3
  4898. VPXOR Y15, Y3, Y15
  4899. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4900. VPADDD Y11, Y15, Y11
  4901. VPXOR Y7, Y11, Y7
  4902. VPSRLD $0x0c, Y7, Y8
  4903. VPSLLD $0x14, Y7, Y7
  4904. VPOR Y7, Y8, Y7
  4905. VPADDD Y3, Y7, Y3
  4906. VPADDD 480(SP), Y3, Y3
  4907. VPXOR Y15, Y3, Y15
  4908. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4909. VPADDD Y11, Y15, Y11
  4910. VPXOR Y7, Y11, Y7
  4911. VPSRLD $0x07, Y7, Y8
  4912. VPSLLD $0x19, Y7, Y7
  4913. VPOR Y7, Y8, Y7
  4914. VPADDD Y0, Y5, Y0
  4915. VPADDD 128(SP), Y0, Y0
  4916. VPXOR Y15, Y0, Y15
  4917. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  4918. VPADDD Y10, Y15, Y10
  4919. VPXOR Y5, Y10, Y5
  4920. VPSRLD $0x0c, Y5, Y8
  4921. VPSLLD $0x14, Y5, Y5
  4922. VPOR Y5, Y8, Y5
  4923. VPADDD Y0, Y5, Y0
  4924. VPADDD (SP), Y0, Y0
  4925. VPXOR Y15, Y0, Y15
  4926. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  4927. VPADDD Y10, Y15, Y10
  4928. VPXOR Y5, Y10, Y5
  4929. VPSRLD $0x07, Y5, Y8
  4930. VPSLLD $0x19, Y5, Y5
  4931. VPOR Y5, Y8, Y5
  4932. VPADDD Y1, Y6, Y1
  4933. VPADDD 352(SP), Y1, Y1
  4934. VPXOR Y12, Y1, Y12
  4935. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4936. VPADDD Y11, Y12, Y11
  4937. VPXOR Y6, Y11, Y6
  4938. VPSRLD $0x0c, Y6, Y8
  4939. VPSLLD $0x14, Y6, Y6
  4940. VPOR Y6, Y8, Y6
  4941. VPADDD Y1, Y6, Y1
  4942. VPADDD 64(SP), Y1, Y1
  4943. VPXOR Y12, Y1, Y12
  4944. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  4945. VPADDD Y11, Y12, Y11
  4946. VPXOR Y6, Y11, Y6
  4947. VPSRLD $0x07, Y6, Y8
  4948. VPSLLD $0x19, Y6, Y6
  4949. VPOR Y6, Y8, Y6
  4950. VPADDD Y2, Y7, Y2
  4951. VPADDD 160(SP), Y2, Y2
  4952. VPXOR Y13, Y2, Y13
  4953. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  4954. VMOVDQU 512(SP), Y8
  4955. VPADDD Y8, Y13, Y8
  4956. VPXOR Y7, Y8, Y7
  4957. VMOVDQU Y8, 512(SP)
  4958. VPSRLD $0x0c, Y7, Y8
  4959. VPSLLD $0x14, Y7, Y7
  4960. VPOR Y7, Y8, Y7
  4961. VPADDD Y2, Y7, Y2
  4962. VPADDD 256(SP), Y2, Y2
  4963. VPXOR Y13, Y2, Y13
  4964. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  4965. VMOVDQU 512(SP), Y8
  4966. VPADDD Y8, Y13, Y8
  4967. VPXOR Y7, Y8, Y7
  4968. VMOVDQU Y8, 512(SP)
  4969. VPSRLD $0x07, Y7, Y8
  4970. VPSLLD $0x19, Y7, Y7
  4971. VPOR Y7, Y8, Y7
  4972. VPADDD Y3, Y4, Y3
  4973. VPADDD 32(SP), Y3, Y3
  4974. VPXOR Y14, Y3, Y14
  4975. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  4976. VPADDD Y9, Y14, Y9
  4977. VPXOR Y4, Y9, Y4
  4978. VPSRLD $0x0c, Y4, Y8
  4979. VPSLLD $0x14, Y4, Y4
  4980. VPOR Y4, Y8, Y4
  4981. VPADDD Y3, Y4, Y3
  4982. VPADDD 192(SP), Y3, Y3
  4983. VPXOR Y14, Y3, Y14
  4984. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  4985. VPADDD Y9, Y14, Y9
  4986. VPXOR Y4, Y9, Y4
  4987. VPSRLD $0x07, Y4, Y8
  4988. VPSLLD $0x19, Y4, Y4
  4989. VPOR Y4, Y8, Y4
  4990. // Round 5
  4991. VPADDD Y0, Y4, Y0
  4992. VPADDD 384(SP), Y0, Y0
  4993. VPXOR Y12, Y0, Y12
  4994. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  4995. VMOVDQU 512(SP), Y8
  4996. VPADDD Y8, Y12, Y8
  4997. VPXOR Y4, Y8, Y4
  4998. VMOVDQU Y8, 512(SP)
  4999. VPSRLD $0x0c, Y4, Y8
  5000. VPSLLD $0x14, Y4, Y4
  5001. VPOR Y4, Y8, Y4
  5002. VPADDD Y0, Y4, Y0
  5003. VPADDD 416(SP), Y0, Y0
  5004. VPXOR Y12, Y0, Y12
  5005. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5006. VMOVDQU 512(SP), Y8
  5007. VPADDD Y8, Y12, Y8
  5008. VPXOR Y4, Y8, Y4
  5009. VMOVDQU Y8, 512(SP)
  5010. VPSRLD $0x07, Y4, Y8
  5011. VPSLLD $0x19, Y4, Y4
  5012. VPOR Y4, Y8, Y4
  5013. VPADDD Y1, Y5, Y1
  5014. VPADDD 288(SP), Y1, Y1
  5015. VPXOR Y13, Y1, Y13
  5016. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5017. VPADDD Y9, Y13, Y9
  5018. VPXOR Y5, Y9, Y5
  5019. VPSRLD $0x0c, Y5, Y8
  5020. VPSLLD $0x14, Y5, Y5
  5021. VPOR Y5, Y8, Y5
  5022. VPADDD Y1, Y5, Y1
  5023. VPADDD 352(SP), Y1, Y1
  5024. VPXOR Y13, Y1, Y13
  5025. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5026. VPADDD Y9, Y13, Y9
  5027. VPXOR Y5, Y9, Y5
  5028. VPSRLD $0x07, Y5, Y8
  5029. VPSLLD $0x19, Y5, Y5
  5030. VPOR Y5, Y8, Y5
  5031. VPADDD Y2, Y6, Y2
  5032. VPADDD 480(SP), Y2, Y2
  5033. VPXOR Y14, Y2, Y14
  5034. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5035. VPADDD Y10, Y14, Y10
  5036. VPXOR Y6, Y10, Y6
  5037. VPSRLD $0x0c, Y6, Y8
  5038. VPSLLD $0x14, Y6, Y6
  5039. VPOR Y6, Y8, Y6
  5040. VPADDD Y2, Y6, Y2
  5041. VPADDD 320(SP), Y2, Y2
  5042. VPXOR Y14, Y2, Y14
  5043. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5044. VPADDD Y10, Y14, Y10
  5045. VPXOR Y6, Y10, Y6
  5046. VPSRLD $0x07, Y6, Y8
  5047. VPSLLD $0x19, Y6, Y6
  5048. VPOR Y6, Y8, Y6
  5049. VPADDD Y3, Y7, Y3
  5050. VPADDD 448(SP), Y3, Y3
  5051. VPXOR Y15, Y3, Y15
  5052. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5053. VPADDD Y11, Y15, Y11
  5054. VPXOR Y7, Y11, Y7
  5055. VPSRLD $0x0c, Y7, Y8
  5056. VPSLLD $0x14, Y7, Y7
  5057. VPOR Y7, Y8, Y7
  5058. VPADDD Y3, Y7, Y3
  5059. VPADDD 256(SP), Y3, Y3
  5060. VPXOR Y15, Y3, Y15
  5061. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5062. VPADDD Y11, Y15, Y11
  5063. VPXOR Y7, Y11, Y7
  5064. VPSRLD $0x07, Y7, Y8
  5065. VPSLLD $0x19, Y7, Y7
  5066. VPOR Y7, Y8, Y7
  5067. VPADDD Y0, Y5, Y0
  5068. VPADDD 224(SP), Y0, Y0
  5069. VPXOR Y15, Y0, Y15
  5070. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5071. VPADDD Y10, Y15, Y10
  5072. VPXOR Y5, Y10, Y5
  5073. VPSRLD $0x0c, Y5, Y8
  5074. VPSLLD $0x14, Y5, Y5
  5075. VPOR Y5, Y8, Y5
  5076. VPADDD Y0, Y5, Y0
  5077. VPADDD 64(SP), Y0, Y0
  5078. VPXOR Y15, Y0, Y15
  5079. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5080. VPADDD Y10, Y15, Y10
  5081. VPXOR Y5, Y10, Y5
  5082. VPSRLD $0x07, Y5, Y8
  5083. VPSLLD $0x19, Y5, Y5
  5084. VPOR Y5, Y8, Y5
  5085. VPADDD Y1, Y6, Y1
  5086. VPADDD 160(SP), Y1, Y1
  5087. VPXOR Y12, Y1, Y12
  5088. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5089. VPADDD Y11, Y12, Y11
  5090. VPXOR Y6, Y11, Y6
  5091. VPSRLD $0x0c, Y6, Y8
  5092. VPSLLD $0x14, Y6, Y6
  5093. VPOR Y6, Y8, Y6
  5094. VPADDD Y1, Y6, Y1
  5095. VPADDD 96(SP), Y1, Y1
  5096. VPXOR Y12, Y1, Y12
  5097. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5098. VPADDD Y11, Y12, Y11
  5099. VPXOR Y6, Y11, Y6
  5100. VPSRLD $0x07, Y6, Y8
  5101. VPSLLD $0x19, Y6, Y6
  5102. VPOR Y6, Y8, Y6
  5103. VPADDD Y2, Y7, Y2
  5104. VPADDD (SP), Y2, Y2
  5105. VPXOR Y13, Y2, Y13
  5106. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5107. VMOVDQU 512(SP), Y8
  5108. VPADDD Y8, Y13, Y8
  5109. VPXOR Y7, Y8, Y7
  5110. VMOVDQU Y8, 512(SP)
  5111. VPSRLD $0x0c, Y7, Y8
  5112. VPSLLD $0x14, Y7, Y7
  5113. VPOR Y7, Y8, Y7
  5114. VPADDD Y2, Y7, Y2
  5115. VPADDD 32(SP), Y2, Y2
  5116. VPXOR Y13, Y2, Y13
  5117. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5118. VMOVDQU 512(SP), Y8
  5119. VPADDD Y8, Y13, Y8
  5120. VPXOR Y7, Y8, Y7
  5121. VMOVDQU Y8, 512(SP)
  5122. VPSRLD $0x07, Y7, Y8
  5123. VPSLLD $0x19, Y7, Y7
  5124. VPOR Y7, Y8, Y7
  5125. VPADDD Y3, Y4, Y3
  5126. VPADDD 192(SP), Y3, Y3
  5127. VPXOR Y14, Y3, Y14
  5128. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5129. VPADDD Y9, Y14, Y9
  5130. VPXOR Y4, Y9, Y4
  5131. VPSRLD $0x0c, Y4, Y8
  5132. VPSLLD $0x14, Y4, Y4
  5133. VPOR Y4, Y8, Y4
  5134. VPADDD Y3, Y4, Y3
  5135. VPADDD 128(SP), Y3, Y3
  5136. VPXOR Y14, Y3, Y14
  5137. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5138. VPADDD Y9, Y14, Y9
  5139. VPXOR Y4, Y9, Y4
  5140. VPSRLD $0x07, Y4, Y8
  5141. VPSLLD $0x19, Y4, Y4
  5142. VPOR Y4, Y8, Y4
  5143. // Round 6
  5144. VPADDD Y0, Y4, Y0
  5145. VPADDD 288(SP), Y0, Y0
  5146. VPXOR Y12, Y0, Y12
  5147. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5148. VMOVDQU 512(SP), Y8
  5149. VPADDD Y8, Y12, Y8
  5150. VPXOR Y4, Y8, Y4
  5151. VMOVDQU Y8, 512(SP)
  5152. VPSRLD $0x0c, Y4, Y8
  5153. VPSLLD $0x14, Y4, Y4
  5154. VPOR Y4, Y8, Y4
  5155. VPADDD Y0, Y4, Y0
  5156. VPADDD 448(SP), Y0, Y0
  5157. VPXOR Y12, Y0, Y12
  5158. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5159. VMOVDQU 512(SP), Y8
  5160. VPADDD Y8, Y12, Y8
  5161. VPXOR Y4, Y8, Y4
  5162. VMOVDQU Y8, 512(SP)
  5163. VPSRLD $0x07, Y4, Y8
  5164. VPSLLD $0x19, Y4, Y4
  5165. VPOR Y4, Y8, Y4
  5166. VPADDD Y1, Y5, Y1
  5167. VPADDD 352(SP), Y1, Y1
  5168. VPXOR Y13, Y1, Y13
  5169. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5170. VPADDD Y9, Y13, Y9
  5171. VPXOR Y5, Y9, Y5
  5172. VPSRLD $0x0c, Y5, Y8
  5173. VPSLLD $0x14, Y5, Y5
  5174. VPOR Y5, Y8, Y5
  5175. VPADDD Y1, Y5, Y1
  5176. VPADDD 160(SP), Y1, Y1
  5177. VPXOR Y13, Y1, Y13
  5178. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5179. VPADDD Y9, Y13, Y9
  5180. VPXOR Y5, Y9, Y5
  5181. VPSRLD $0x07, Y5, Y8
  5182. VPSLLD $0x19, Y5, Y5
  5183. VPOR Y5, Y8, Y5
  5184. VPADDD Y2, Y6, Y2
  5185. VPADDD 256(SP), Y2, Y2
  5186. VPXOR Y14, Y2, Y14
  5187. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5188. VPADDD Y10, Y14, Y10
  5189. VPXOR Y6, Y10, Y6
  5190. VPSRLD $0x0c, Y6, Y8
  5191. VPSLLD $0x14, Y6, Y6
  5192. VPOR Y6, Y8, Y6
  5193. VPADDD Y2, Y6, Y2
  5194. VPADDD 384(SP), Y2, Y2
  5195. VPXOR Y14, Y2, Y14
  5196. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5197. VPADDD Y10, Y14, Y10
  5198. VPXOR Y6, Y10, Y6
  5199. VPSRLD $0x07, Y6, Y8
  5200. VPSLLD $0x19, Y6, Y6
  5201. VPOR Y6, Y8, Y6
  5202. VPADDD Y3, Y7, Y3
  5203. VPADDD 480(SP), Y3, Y3
  5204. VPXOR Y15, Y3, Y15
  5205. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5206. VPADDD Y11, Y15, Y11
  5207. VPXOR Y7, Y11, Y7
  5208. VPSRLD $0x0c, Y7, Y8
  5209. VPSLLD $0x14, Y7, Y7
  5210. VPOR Y7, Y8, Y7
  5211. VPADDD Y3, Y7, Y3
  5212. VPADDD 32(SP), Y3, Y3
  5213. VPXOR Y15, Y3, Y15
  5214. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5215. VPADDD Y11, Y15, Y11
  5216. VPXOR Y7, Y11, Y7
  5217. VPSRLD $0x07, Y7, Y8
  5218. VPSLLD $0x19, Y7, Y7
  5219. VPOR Y7, Y8, Y7
  5220. VPADDD Y0, Y5, Y0
  5221. VPADDD 416(SP), Y0, Y0
  5222. VPXOR Y15, Y0, Y15
  5223. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5224. VPADDD Y10, Y15, Y10
  5225. VPXOR Y5, Y10, Y5
  5226. VPSRLD $0x0c, Y5, Y8
  5227. VPSLLD $0x14, Y5, Y5
  5228. VPOR Y5, Y8, Y5
  5229. VPADDD Y0, Y5, Y0
  5230. VPADDD 96(SP), Y0, Y0
  5231. VPXOR Y15, Y0, Y15
  5232. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5233. VPADDD Y10, Y15, Y10
  5234. VPXOR Y5, Y10, Y5
  5235. VPSRLD $0x07, Y5, Y8
  5236. VPSLLD $0x19, Y5, Y5
  5237. VPOR Y5, Y8, Y5
  5238. VPADDD Y1, Y6, Y1
  5239. VPADDD (SP), Y1, Y1
  5240. VPXOR Y12, Y1, Y12
  5241. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5242. VPADDD Y11, Y12, Y11
  5243. VPXOR Y6, Y11, Y6
  5244. VPSRLD $0x0c, Y6, Y8
  5245. VPSLLD $0x14, Y6, Y6
  5246. VPOR Y6, Y8, Y6
  5247. VPADDD Y1, Y6, Y1
  5248. VPADDD 320(SP), Y1, Y1
  5249. VPXOR Y12, Y1, Y12
  5250. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5251. VPADDD Y11, Y12, Y11
  5252. VPXOR Y6, Y11, Y6
  5253. VPSRLD $0x07, Y6, Y8
  5254. VPSLLD $0x19, Y6, Y6
  5255. VPOR Y6, Y8, Y6
  5256. VPADDD Y2, Y7, Y2
  5257. VPADDD 64(SP), Y2, Y2
  5258. VPXOR Y13, Y2, Y13
  5259. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5260. VMOVDQU 512(SP), Y8
  5261. VPADDD Y8, Y13, Y8
  5262. VPXOR Y7, Y8, Y7
  5263. VMOVDQU Y8, 512(SP)
  5264. VPSRLD $0x0c, Y7, Y8
  5265. VPSLLD $0x14, Y7, Y7
  5266. VPOR Y7, Y8, Y7
  5267. VPADDD Y2, Y7, Y2
  5268. VPADDD 192(SP), Y2, Y2
  5269. VPXOR Y13, Y2, Y13
  5270. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5271. VMOVDQU 512(SP), Y8
  5272. VPADDD Y8, Y13, Y8
  5273. VPXOR Y7, Y8, Y7
  5274. VMOVDQU Y8, 512(SP)
  5275. VPSRLD $0x07, Y7, Y8
  5276. VPSLLD $0x19, Y7, Y7
  5277. VPOR Y7, Y8, Y7
  5278. VPADDD Y3, Y4, Y3
  5279. VPADDD 128(SP), Y3, Y3
  5280. VPXOR Y14, Y3, Y14
  5281. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5282. VPADDD Y9, Y14, Y9
  5283. VPXOR Y4, Y9, Y4
  5284. VPSRLD $0x0c, Y4, Y8
  5285. VPSLLD $0x14, Y4, Y4
  5286. VPOR Y4, Y8, Y4
  5287. VPADDD Y3, Y4, Y3
  5288. VPADDD 224(SP), Y3, Y3
  5289. VPXOR Y14, Y3, Y14
  5290. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5291. VPADDD Y9, Y14, Y9
  5292. VPXOR Y4, Y9, Y4
  5293. VPSRLD $0x07, Y4, Y8
  5294. VPSLLD $0x19, Y4, Y4
  5295. VPOR Y4, Y8, Y4
  5296. // Round 7
  5297. VPADDD Y0, Y4, Y0
  5298. VPADDD 352(SP), Y0, Y0
  5299. VPXOR Y12, Y0, Y12
  5300. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5301. VMOVDQU 512(SP), Y8
  5302. VPADDD Y8, Y12, Y8
  5303. VPXOR Y4, Y8, Y4
  5304. VMOVDQU Y8, 512(SP)
  5305. VPSRLD $0x0c, Y4, Y8
  5306. VPSLLD $0x14, Y4, Y4
  5307. VPOR Y4, Y8, Y4
  5308. VPADDD Y0, Y4, Y0
  5309. VPADDD 480(SP), Y0, Y0
  5310. VPXOR Y12, Y0, Y12
  5311. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5312. VMOVDQU 512(SP), Y8
  5313. VPADDD Y8, Y12, Y8
  5314. VPXOR Y4, Y8, Y4
  5315. VMOVDQU Y8, 512(SP)
  5316. VPSRLD $0x07, Y4, Y8
  5317. VPSLLD $0x19, Y4, Y4
  5318. VPOR Y4, Y8, Y4
  5319. VPADDD Y1, Y5, Y1
  5320. VPADDD 160(SP), Y1, Y1
  5321. VPXOR Y13, Y1, Y13
  5322. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5323. VPADDD Y9, Y13, Y9
  5324. VPXOR Y5, Y9, Y5
  5325. VPSRLD $0x0c, Y5, Y8
  5326. VPSLLD $0x14, Y5, Y5
  5327. VPOR Y5, Y8, Y5
  5328. VPADDD Y1, Y5, Y1
  5329. VPADDD (SP), Y1, Y1
  5330. VPXOR Y13, Y1, Y13
  5331. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5332. VPADDD Y9, Y13, Y9
  5333. VPXOR Y5, Y9, Y5
  5334. VPSRLD $0x07, Y5, Y8
  5335. VPSLLD $0x19, Y5, Y5
  5336. VPOR Y5, Y8, Y5
  5337. VPADDD Y2, Y6, Y2
  5338. VPADDD 32(SP), Y2, Y2
  5339. VPXOR Y14, Y2, Y14
  5340. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5341. VPADDD Y10, Y14, Y10
  5342. VPXOR Y6, Y10, Y6
  5343. VPSRLD $0x0c, Y6, Y8
  5344. VPSLLD $0x14, Y6, Y6
  5345. VPOR Y6, Y8, Y6
  5346. VPADDD Y2, Y6, Y2
  5347. VPADDD 288(SP), Y2, Y2
  5348. VPXOR Y14, Y2, Y14
  5349. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5350. VPADDD Y10, Y14, Y10
  5351. VPXOR Y6, Y10, Y6
  5352. VPSRLD $0x07, Y6, Y8
  5353. VPSLLD $0x19, Y6, Y6
  5354. VPOR Y6, Y8, Y6
  5355. VPADDD Y3, Y7, Y3
  5356. VPADDD 256(SP), Y3, Y3
  5357. VPXOR Y15, Y3, Y15
  5358. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5359. VPADDD Y11, Y15, Y11
  5360. VPXOR Y7, Y11, Y7
  5361. VPSRLD $0x0c, Y7, Y8
  5362. VPSLLD $0x14, Y7, Y7
  5363. VPOR Y7, Y8, Y7
  5364. VPADDD Y3, Y7, Y3
  5365. VPADDD 192(SP), Y3, Y3
  5366. VPXOR Y15, Y3, Y15
  5367. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5368. VPADDD Y11, Y15, Y11
  5369. VPXOR Y7, Y11, Y7
  5370. VPSRLD $0x07, Y7, Y8
  5371. VPSLLD $0x19, Y7, Y7
  5372. VPOR Y7, Y8, Y7
  5373. VPADDD Y0, Y5, Y0
  5374. VPADDD 448(SP), Y0, Y0
  5375. VPXOR Y15, Y0, Y15
  5376. VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
  5377. VPADDD Y10, Y15, Y10
  5378. VPXOR Y5, Y10, Y5
  5379. VPSRLD $0x0c, Y5, Y8
  5380. VPSLLD $0x14, Y5, Y5
  5381. VPOR Y5, Y8, Y5
  5382. VPADDD Y0, Y5, Y0
  5383. VPADDD 320(SP), Y0, Y0
  5384. VPXOR Y15, Y0, Y15
  5385. VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
  5386. VPADDD Y10, Y15, Y10
  5387. VPXOR Y5, Y10, Y5
  5388. VPSRLD $0x07, Y5, Y8
  5389. VPSLLD $0x19, Y5, Y5
  5390. VPOR Y5, Y8, Y5
  5391. VPADDD Y1, Y6, Y1
  5392. VPADDD 64(SP), Y1, Y1
  5393. VPXOR Y12, Y1, Y12
  5394. VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
  5395. VPADDD Y11, Y12, Y11
  5396. VPXOR Y6, Y11, Y6
  5397. VPSRLD $0x0c, Y6, Y8
  5398. VPSLLD $0x14, Y6, Y6
  5399. VPOR Y6, Y8, Y6
  5400. VPADDD Y1, Y6, Y1
  5401. VPADDD 384(SP), Y1, Y1
  5402. VPXOR Y12, Y1, Y12
  5403. VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
  5404. VPADDD Y11, Y12, Y11
  5405. VPXOR Y6, Y11, Y6
  5406. VPSRLD $0x07, Y6, Y8
  5407. VPSLLD $0x19, Y6, Y6
  5408. VPOR Y6, Y8, Y6
  5409. VPADDD Y2, Y7, Y2
  5410. VPADDD 96(SP), Y2, Y2
  5411. VPXOR Y13, Y2, Y13
  5412. VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
  5413. VMOVDQU 512(SP), Y8
  5414. VPADDD Y8, Y13, Y8
  5415. VPXOR Y7, Y8, Y7
  5416. VMOVDQU Y8, 512(SP)
  5417. VPSRLD $0x0c, Y7, Y8
  5418. VPSLLD $0x14, Y7, Y7
  5419. VPOR Y7, Y8, Y7
  5420. VPADDD Y2, Y7, Y2
  5421. VPADDD 128(SP), Y2, Y2
  5422. VPXOR Y13, Y2, Y13
  5423. VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
  5424. VMOVDQU 512(SP), Y8
  5425. VPADDD Y8, Y13, Y8
  5426. VPXOR Y7, Y8, Y7
  5427. VMOVDQU Y8, 512(SP)
  5428. VPSRLD $0x07, Y7, Y8
  5429. VPSLLD $0x19, Y7, Y7
  5430. VPOR Y7, Y8, Y7
  5431. VPADDD Y3, Y4, Y3
  5432. VPADDD 224(SP), Y3, Y3
  5433. VPXOR Y14, Y3, Y14
  5434. VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
  5435. VPADDD Y9, Y14, Y9
  5436. VPXOR Y4, Y9, Y4
  5437. VPSRLD $0x0c, Y4, Y8
  5438. VPSLLD $0x14, Y4, Y4
  5439. VPOR Y4, Y8, Y4
  5440. VPADDD Y3, Y4, Y3
  5441. VPADDD 416(SP), Y3, Y3
  5442. VPXOR Y14, Y3, Y14
  5443. VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
  5444. VPADDD Y9, Y14, Y9
  5445. VPXOR Y4, Y9, Y4
  5446. VPSRLD $0x07, Y4, Y8
  5447. VPSLLD $0x19, Y4, Y4
  5448. VPOR Y4, Y8, Y4
  5449. VMOVDQU 512(SP), Y8
  5450. // Finalize CVs
  5451. VPXOR Y0, Y8, Y0
  5452. VPXOR Y1, Y9, Y1
  5453. VPXOR Y2, Y10, Y2
  5454. VPXOR Y3, Y11, Y3
  5455. VPXOR Y4, Y12, Y4
  5456. VPXOR Y5, Y13, Y5
  5457. VPXOR Y6, Y14, Y6
  5458. VPXOR Y7, Y15, Y7
  5459. VPUNPCKLDQ Y1, Y0, Y8
  5460. VPUNPCKHDQ Y1, Y0, Y9
  5461. VPUNPCKLDQ Y3, Y2, Y10
  5462. VPUNPCKHDQ Y3, Y2, Y11
  5463. VPUNPCKLDQ Y5, Y4, Y12
  5464. VPUNPCKHDQ Y5, Y4, Y13
  5465. VPUNPCKLDQ Y7, Y6, Y14
  5466. VPUNPCKHDQ Y7, Y6, Y15
  5467. VPUNPCKLQDQ Y10, Y8, Y0
  5468. VPUNPCKHQDQ Y10, Y8, Y1
  5469. VPUNPCKLQDQ Y11, Y9, Y2
  5470. VPUNPCKHQDQ Y11, Y9, Y3
  5471. VPUNPCKLQDQ Y14, Y12, Y4
  5472. VPUNPCKHQDQ Y14, Y12, Y5
  5473. VPUNPCKLQDQ Y15, Y13, Y6
  5474. VPUNPCKHQDQ Y15, Y13, Y7
  5475. VPERM2I128 $0x20, Y4, Y0, Y8
  5476. VPERM2I128 $0x31, Y4, Y0, Y12
  5477. VPERM2I128 $0x20, Y5, Y1, Y9
  5478. VPERM2I128 $0x31, Y5, Y1, Y13
  5479. VPERM2I128 $0x20, Y6, Y2, Y10
  5480. VPERM2I128 $0x31, Y6, Y2, Y14
  5481. VPERM2I128 $0x20, Y7, Y3, Y11
  5482. VPERM2I128 $0x31, Y7, Y3, Y15
  5483. VMOVDQU Y8, (AX)
  5484. VMOVDQU Y9, 32(AX)
  5485. VMOVDQU Y10, 64(AX)
  5486. VMOVDQU Y11, 96(AX)
  5487. VMOVDQU Y12, 128(AX)
  5488. VMOVDQU Y13, 160(AX)
  5489. VMOVDQU Y14, 192(AX)
  5490. VMOVDQU Y15, 224(AX)
  5491. RET