seqdec_amd64.s 80 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079
  1. // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
  2. //go:build !appengine && !noasm && gc && !noasm
  3. // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  4. // Requires: CMOV
     //
     // Decodes sequences in a loop. For each sequence three 8-byte values are
     // written through the output cursor loaded from 104(ctx): literal length at
     // +0, match length at +8 and offset at +16; the cursor then advances by 24
     // bytes. The counter at 96(ctx) is decremented once per sequence and the
     // loop continues while it stays non-negative.
     //
     // Return value (ret+24(FP)):
     //   0  success; bit-reader fields and the three previous offsets are
     //      written back to br and s.
     //   1  the resulting offset is zero but the match length is not.
     //   2  match length is above 0x20002.
     //   4  the counter at 128(ctx) went negative after subtracting the
     //      literal length.
     // The code-3 return near the end has no label and follows a RET, so no
     // branch in this function reaches it.
     // On the error returns nothing is written back to br or to 144..160(s).
     //
     // Register roles inside the loop:
     //   DX            bit container (from 32(br))
     //   BX            count of bits already consumed from DX (byte at 40(br))
     //   SI            distance of the read pointer from the base at (br)
     //                 (from 24(br)); counts down as input is consumed
     //   (SP)          saved read pointer = (br) + 24(br)
     //   DI, R8, R9    literal-length, match-length and offset state words
     //                 (from 72/80/88(ctx))
     //   R10           output cursor
     //   R11, R12, R13 offset history (from 144/152/160(s))
     //   AX, CX, R14, R15, BP  scratch
     //
     // Layout of a state word, as this code uses it:
     //   bits  0..7   number of bits to read for the next state
     //   bits  8..15  number of extra bits to read for the value
     //   bits 16..31  base added to the bits read to form the next table index
     //   bits 32..63  baseline added to the extra bits to form the value
     // NOTE(review): the Go field names behind these offsets are not visible in
     // this file; confirm against the struct definitions.
  5. TEXT ·sequenceDecs_decode_amd64(SB), $8-32
     // Load the bit-reader fields and compute the initial read pointer.
  6. MOVQ br+8(FP), AX
  7. MOVQ 32(AX), DX
  8. MOVBQZX 40(AX), BX
  9. MOVQ 24(AX), SI
  10. MOVQ (AX), AX
  11. ADDQ SI, AX
  12. MOVQ AX, (SP)
     // Load the three state words and the output cursor from ctx.
  13. MOVQ ctx+16(FP), AX
  14. MOVQ 72(AX), DI
  15. MOVQ 80(AX), R8
  16. MOVQ 88(AX), R9
  17. MOVQ 104(AX), R10
     // Load the offset history from s.
  18. MOVQ s+0(FP), AX
  19. MOVQ 144(AX), R11
  20. MOVQ 152(AX), R12
  21. MOVQ 160(AX), R13
  22. sequenceDecs_decode_amd64_main_loop:
  23. MOVQ (SP), R14
  24. // Fill bitreader to have enough for the offset and match length.
     // Fast path (SI >= 8): step the pointer back by the whole bytes already
     // consumed (BX >> 3), reload 8 bytes and keep only the sub-byte bit count.
  25. CMPQ SI, $0x08
  26. JL sequenceDecs_decode_amd64_fill_byte_by_byte
  27. MOVQ BX, AX
  28. SHRQ $0x03, AX
  29. SUBQ AX, R14
  30. MOVQ (R14), DX
  31. SUBQ AX, SI
  32. ANDQ $0x07, BX
  33. JMP sequenceDecs_decode_amd64_fill_end
     // Slow path: while input remains (SI > 0) and at least one full byte has
     // been consumed (BX > 7), shift one new byte into the low end of DX.
  34. sequenceDecs_decode_amd64_fill_byte_by_byte:
  35. CMPQ SI, $0x00
  36. JLE sequenceDecs_decode_amd64_fill_end
  37. CMPQ BX, $0x07
  38. JLE sequenceDecs_decode_amd64_fill_end
  39. SHLQ $0x08, DX
  40. SUBQ $0x01, R14
  41. SUBQ $0x01, SI
  42. SUBQ $0x08, BX
  43. MOVBQZX (R14), AX
  44. ORQ AX, DX
  45. JMP sequenceDecs_decode_amd64_fill_byte_by_byte
  46. sequenceDecs_decode_amd64_fill_end:
  47. // Update offset
     // R15 = DX with the consumed bits shifted out; CX = extra-bit count taken
     // from byte 1 of the state word; AX = baseline (upper 32 bits).
     // With no extra bits the baseline alone is stored. Otherwise BX is
     // advanced by the bit count and, unless BX > 64 or the count >= 64, the
     // top `count` bits of R15 are added (NEGQ + SHRQ shifts right by 64-count).
  48. MOVQ R9, AX
  49. MOVQ BX, CX
  50. MOVQ DX, R15
  51. SHLQ CL, R15
  52. MOVB AH, CL
  53. SHRQ $0x20, AX
  54. TESTQ CX, CX
  55. JZ sequenceDecs_decode_amd64_of_update_zero
  56. ADDQ CX, BX
  57. CMPQ BX, $0x40
  58. JA sequenceDecs_decode_amd64_of_update_zero
  59. CMPQ CX, $0x40
  60. JAE sequenceDecs_decode_amd64_of_update_zero
  61. NEGQ CX
  62. SHRQ CL, R15
  63. ADDQ R15, AX
  64. sequenceDecs_decode_amd64_of_update_zero:
  65. MOVQ AX, 16(R10)
  66. // Update match length
     // Same extraction as for the offset, using state word R8; result goes to
     // 8(R10).
  67. MOVQ R8, AX
  68. MOVQ BX, CX
  69. MOVQ DX, R15
  70. SHLQ CL, R15
  71. MOVB AH, CL
  72. SHRQ $0x20, AX
  73. TESTQ CX, CX
  74. JZ sequenceDecs_decode_amd64_ml_update_zero
  75. ADDQ CX, BX
  76. CMPQ BX, $0x40
  77. JA sequenceDecs_decode_amd64_ml_update_zero
  78. CMPQ CX, $0x40
  79. JAE sequenceDecs_decode_amd64_ml_update_zero
  80. NEGQ CX
  81. SHRQ CL, R15
  82. ADDQ R15, AX
  83. sequenceDecs_decode_amd64_ml_update_zero:
  84. MOVQ AX, 8(R10)
  85. // Fill bitreader to have enough for the remaining
     // Same refill logic as at the top of the loop.
  86. CMPQ SI, $0x08
  87. JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
  88. MOVQ BX, AX
  89. SHRQ $0x03, AX
  90. SUBQ AX, R14
  91. MOVQ (R14), DX
  92. SUBQ AX, SI
  93. ANDQ $0x07, BX
  94. JMP sequenceDecs_decode_amd64_fill_2_end
  95. sequenceDecs_decode_amd64_fill_2_byte_by_byte:
  96. CMPQ SI, $0x00
  97. JLE sequenceDecs_decode_amd64_fill_2_end
  98. CMPQ BX, $0x07
  99. JLE sequenceDecs_decode_amd64_fill_2_end
  100. SHLQ $0x08, DX
  101. SUBQ $0x01, R14
  102. SUBQ $0x01, SI
  103. SUBQ $0x08, BX
  104. MOVBQZX (R14), AX
  105. ORQ AX, DX
  106. JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
  107. sequenceDecs_decode_amd64_fill_2_end:
  108. // Update literal length
     // Same extraction again, using state word DI; result goes to (R10).
  109. MOVQ DI, AX
  110. MOVQ BX, CX
  111. MOVQ DX, R15
  112. SHLQ CL, R15
  113. MOVB AH, CL
  114. SHRQ $0x20, AX
  115. TESTQ CX, CX
  116. JZ sequenceDecs_decode_amd64_ll_update_zero
  117. ADDQ CX, BX
  118. CMPQ BX, $0x40
  119. JA sequenceDecs_decode_amd64_ll_update_zero
  120. CMPQ CX, $0x40
  121. JAE sequenceDecs_decode_amd64_ll_update_zero
  122. NEGQ CX
  123. SHRQ CL, R15
  124. ADDQ R15, AX
  125. sequenceDecs_decode_amd64_ll_update_zero:
  126. MOVQ AX, (R10)
  127. // Fill bitreader for state updates
     // Save the read pointer back to its stack slot, then keep the offset
     // state's extra-bit count (byte 1 of R9) in AX: R9 is overwritten by the
     // state update below, but the count is needed by "Adjust offset".
  128. MOVQ R14, (SP)
  129. MOVQ R9, AX
  130. SHRQ $0x08, AX
  131. MOVBQZX AL, AX
     // When the counter at 96(ctx) is zero this is the final iteration (the
     // DECQ at the bottom will take it negative), so the states are not advanced.
  132. MOVQ ctx+16(FP), CX
  133. CMPQ 96(CX), $0x00
  134. JZ sequenceDecs_decode_amd64_skip_update
  135. // Update Literal Length State
     // R14 = bits to read (byte 0), DI = next-state base (bits 16..31).
     // BX advances by R14; rotating DX left by the new BX brings the bits just
     // read into the low end, where they are masked with (1<<R14)-1 and added
     // to the base. BP is used as scratch for the mask.
  136. MOVBQZX DI, R14
  137. SHRQ $0x10, DI
  138. MOVWQZX DI, DI
  139. LEAQ (BX)(R14*1), CX
  140. MOVQ DX, R15
  141. MOVQ CX, BX
  142. ROLQ CL, R15
  143. MOVL $0x00000001, BP
  144. MOVB R14, CL
  145. SHLL CL, BP
  146. DECL BP
  147. ANDQ BP, R15
  148. ADDQ R15, DI
  149. // Load ctx.llTable
     // DI becomes the 8-byte table entry at the index just computed.
  150. MOVQ ctx+16(FP), CX
  151. MOVQ (CX), CX
  152. MOVQ (CX)(DI*8), DI
  153. // Update Match Length State
     // Same steps as for the literal-length state, applied to R8.
  154. MOVBQZX R8, R14
  155. SHRQ $0x10, R8
  156. MOVWQZX R8, R8
  157. LEAQ (BX)(R14*1), CX
  158. MOVQ DX, R15
  159. MOVQ CX, BX
  160. ROLQ CL, R15
  161. MOVL $0x00000001, BP
  162. MOVB R14, CL
  163. SHLL CL, BP
  164. DECL BP
  165. ANDQ BP, R15
  166. ADDQ R15, R8
  167. // Load ctx.mlTable
  168. MOVQ ctx+16(FP), CX
  169. MOVQ 24(CX), CX
  170. MOVQ (CX)(R8*8), R8
  171. // Update Offset State
     // Same steps again, applied to R9.
  172. MOVBQZX R9, R14
  173. SHRQ $0x10, R9
  174. MOVWQZX R9, R9
  175. LEAQ (BX)(R14*1), CX
  176. MOVQ DX, R15
  177. MOVQ CX, BX
  178. ROLQ CL, R15
  179. MOVL $0x00000001, BP
  180. MOVB R14, CL
  181. SHLL CL, BP
  182. DECL BP
  183. ANDQ BP, R15
  184. ADDQ R15, R9
  185. // Load ctx.ofTable
  186. MOVQ ctx+16(FP), CX
  187. MOVQ 48(CX), CX
  188. MOVQ (CX)(R9*8), R9
  189. sequenceDecs_decode_amd64_skip_update:
  190. // Adjust offset
     // CX = offset value decoded above; AX = the offset's extra-bit count.
     // More than one extra bit: the value is used as-is and pushed onto the
     // front of the history (R11 <- CX, R12 <- old R11, R13 <- old R12).
  191. MOVQ 16(R10), CX
  192. CMPQ AX, $0x01
  193. JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
  194. MOVQ R12, R13
  195. MOVQ R11, R12
  196. MOVQ CX, R11
  197. JMP sequenceDecs_decode_amd64_after_adjust
     // Zero or one extra bit: CX selects from the history instead.
     // If the literal length just stored is zero, the selector is incremented.
  198. sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
  199. CMPQ (R10), $0x00000000
  200. JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
  201. INCQ CX
  202. JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
     // Non-zero literal length with selector 0: reuse R11; history unchanged.
  203. sequenceDecs_decode_amd64_adjust_offset_maybezero:
  204. TESTQ CX, CX
  205. JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
  206. MOVQ R11, CX
  207. JMP sequenceDecs_decode_amd64_after_adjust
     // Choose the candidate (AX) by selector CX:
     //   0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11 - 1.
  208. sequenceDecs_decode_amd64_adjust_offset_nonzero:
  209. CMPQ CX, $0x01
  210. JB sequenceDecs_decode_amd64_adjust_zero
  211. JEQ sequenceDecs_decode_amd64_adjust_one
  212. CMPQ CX, $0x02
  213. JA sequenceDecs_decode_amd64_adjust_three
  214. JMP sequenceDecs_decode_amd64_adjust_two
  215. sequenceDecs_decode_amd64_adjust_zero:
  216. MOVQ R11, AX
  217. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  218. sequenceDecs_decode_amd64_adjust_one:
  219. MOVQ R12, AX
  220. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  221. sequenceDecs_decode_amd64_adjust_two:
  222. MOVQ R13, AX
  223. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  224. sequenceDecs_decode_amd64_adjust_three:
  225. LEAQ -1(R11), AX
     // A candidate of zero is replaced with 1.
  226. sequenceDecs_decode_amd64_adjust_test_temp_valid:
  227. TESTQ AX, AX
  228. JNZ sequenceDecs_decode_amd64_adjust_temp_valid
  229. MOVQ $0x00000001, AX
     // Rotate the history: R13 takes R12 unless the selector was 1, R12 takes
     // R11, and the candidate becomes both R11 and the resulting offset.
  230. sequenceDecs_decode_amd64_adjust_temp_valid:
  231. CMPQ CX, $0x01
  232. CMOVQNE R12, R13
  233. MOVQ R11, R12
  234. MOVQ AX, R11
  235. MOVQ AX, CX
  236. sequenceDecs_decode_amd64_after_adjust:
  237. MOVQ CX, 16(R10)
  238. // Check values
     // AX = match length, R14 = literal length, CX = final offset.
     // 256(s) accumulates literal length + match length; 128(ctx) is reduced
     // by the literal length and a negative result is error 4.
  239. MOVQ 8(R10), AX
  240. MOVQ (R10), R14
  241. LEAQ (AX)(R14*1), R15
  242. MOVQ s+0(FP), BP
  243. ADDQ R15, 256(BP)
  244. MOVQ ctx+16(FP), R15
  245. SUBQ R14, 128(R15)
  246. JS error_not_enough_literals
     // Match length above 0x20002 is error 2.
  247. CMPQ AX, $0x00020002
  248. JA sequenceDecs_decode_amd64_error_match_len_too_big
     // A zero offset is only accepted together with a zero match length.
  249. TESTQ CX, CX
  250. JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
  251. TESTQ AX, AX
  252. JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
     // Advance the output cursor by one 24-byte record, decrement the counter
     // at 96(ctx) and loop while it is still non-negative.
  253. sequenceDecs_decode_amd64_match_len_ofs_ok:
  254. ADDQ $0x18, R10
  255. MOVQ ctx+16(FP), AX
  256. DECQ 96(AX)
  257. JNS sequenceDecs_decode_amd64_main_loop
     // Loop finished: write the offset history back to s and the bit-reader
     // registers (DX, BL, SI) back to br.
  258. MOVQ s+0(FP), AX
  259. MOVQ R11, 144(AX)
  260. MOVQ R12, 152(AX)
  261. MOVQ R13, 160(AX)
  262. MOVQ br+8(FP), AX
  263. MOVQ DX, 32(AX)
  264. MOVB BL, 40(AX)
  265. MOVQ SI, 24(AX)
  266. // Return success
  267. MOVQ $0x00000000, ret+24(FP)
  268. RET
  269. // Return with match length error
  270. sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
  271. MOVQ $0x00000001, ret+24(FP)
  272. RET
  273. // Return with match too long error
  274. sequenceDecs_decode_amd64_error_match_len_too_big:
  275. MOVQ $0x00000002, ret+24(FP)
  276. RET
  277. // Return with match offset too long error
     // No label precedes this return and the previous instruction is RET, so
     // it is not reached from anywhere in this function.
  278. MOVQ $0x00000003, ret+24(FP)
  279. RET
  280. // Return with not enough literals error
  281. error_not_enough_literals:
  282. MOVQ $0x00000004, ret+24(FP)
  283. RET
  284. // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  285. // Requires: CMOV
     //
     // Decodes sequences in a loop. For each sequence three 8-byte values are
     // written through the output cursor loaded from 104(ctx): literal length at
     // +0, match length at +8 and offset at +16; the cursor then advances by 24
     // bytes. The counter at 96(ctx) is decremented once per sequence and the
     // loop continues while it stays non-negative.
     //
     // The bit container is refilled only once per iteration here: offset,
     // match length and literal length are all read from that single fill.
     // NOTE(review): presumably the caller selects this variant only when the
     // three reads fit in 56 bits -- confirm at the call site.
     //
     // Return value (ret+24(FP)):
     //   0  success; bit-reader fields and the three previous offsets are
     //      written back to br and s.
     //   1  the resulting offset is zero but the match length is not.
     //   2  match length is above 0x20002.
     //   4  the counter at 128(ctx) went negative after subtracting the
     //      literal length.
     // The code-3 return near the end has no label and follows a RET, so no
     // branch in this function reaches it.
     // On the error returns nothing is written back to br or to 144..160(s).
     //
     // Register roles inside the loop:
     //   DX            bit container (from 32(br))
     //   BX            count of bits already consumed from DX (byte at 40(br))
     //   SI            distance of the read pointer from the base at (br)
     //                 (from 24(br)); counts down as input is consumed
     //   (SP)          saved read pointer = (br) + 24(br)
     //   DI, R8, R9    literal-length, match-length and offset state words
     //                 (from 72/80/88(ctx))
     //   R10           output cursor
     //   R11, R12, R13 offset history (from 144/152/160(s))
     //   AX, CX, R14, R15, BP  scratch
     //
     // Layout of a state word, as this code uses it:
     //   bits  0..7   number of bits to read for the next state
     //   bits  8..15  number of extra bits to read for the value
     //   bits 16..31  base added to the bits read to form the next table index
     //   bits 32..63  baseline added to the extra bits to form the value
  286. TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
     // Load the bit-reader fields and compute the initial read pointer.
  287. MOVQ br+8(FP), AX
  288. MOVQ 32(AX), DX
  289. MOVBQZX 40(AX), BX
  290. MOVQ 24(AX), SI
  291. MOVQ (AX), AX
  292. ADDQ SI, AX
  293. MOVQ AX, (SP)
     // Load the three state words and the output cursor from ctx.
  294. MOVQ ctx+16(FP), AX
  295. MOVQ 72(AX), DI
  296. MOVQ 80(AX), R8
  297. MOVQ 88(AX), R9
  298. MOVQ 104(AX), R10
     // Load the offset history from s.
  299. MOVQ s+0(FP), AX
  300. MOVQ 144(AX), R11
  301. MOVQ 152(AX), R12
  302. MOVQ 160(AX), R13
  303. sequenceDecs_decode_56_amd64_main_loop:
  304. MOVQ (SP), R14
  305. // Fill bitreader to have enough for the offset and match length.
     // Fast path (SI >= 8): step the pointer back by the whole bytes already
     // consumed (BX >> 3), reload 8 bytes and keep only the sub-byte bit count.
  306. CMPQ SI, $0x08
  307. JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
  308. MOVQ BX, AX
  309. SHRQ $0x03, AX
  310. SUBQ AX, R14
  311. MOVQ (R14), DX
  312. SUBQ AX, SI
  313. ANDQ $0x07, BX
  314. JMP sequenceDecs_decode_56_amd64_fill_end
     // Slow path: while input remains (SI > 0) and at least one full byte has
     // been consumed (BX > 7), shift one new byte into the low end of DX.
  315. sequenceDecs_decode_56_amd64_fill_byte_by_byte:
  316. CMPQ SI, $0x00
  317. JLE sequenceDecs_decode_56_amd64_fill_end
  318. CMPQ BX, $0x07
  319. JLE sequenceDecs_decode_56_amd64_fill_end
  320. SHLQ $0x08, DX
  321. SUBQ $0x01, R14
  322. SUBQ $0x01, SI
  323. SUBQ $0x08, BX
  324. MOVBQZX (R14), AX
  325. ORQ AX, DX
  326. JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
  327. sequenceDecs_decode_56_amd64_fill_end:
  328. // Update offset
     // R15 = DX with the consumed bits shifted out; CX = extra-bit count taken
     // from byte 1 of the state word; AX = baseline (upper 32 bits).
     // With no extra bits the baseline alone is stored. Otherwise BX is
     // advanced by the bit count and, unless BX > 64 or the count >= 64, the
     // top `count` bits of R15 are added (NEGQ + SHRQ shifts right by 64-count).
  329. MOVQ R9, AX
  330. MOVQ BX, CX
  331. MOVQ DX, R15
  332. SHLQ CL, R15
  333. MOVB AH, CL
  334. SHRQ $0x20, AX
  335. TESTQ CX, CX
  336. JZ sequenceDecs_decode_56_amd64_of_update_zero
  337. ADDQ CX, BX
  338. CMPQ BX, $0x40
  339. JA sequenceDecs_decode_56_amd64_of_update_zero
  340. CMPQ CX, $0x40
  341. JAE sequenceDecs_decode_56_amd64_of_update_zero
  342. NEGQ CX
  343. SHRQ CL, R15
  344. ADDQ R15, AX
  345. sequenceDecs_decode_56_amd64_of_update_zero:
  346. MOVQ AX, 16(R10)
  347. // Update match length
     // Same extraction as for the offset, using state word R8; result goes to
     // 8(R10).
  348. MOVQ R8, AX
  349. MOVQ BX, CX
  350. MOVQ DX, R15
  351. SHLQ CL, R15
  352. MOVB AH, CL
  353. SHRQ $0x20, AX
  354. TESTQ CX, CX
  355. JZ sequenceDecs_decode_56_amd64_ml_update_zero
  356. ADDQ CX, BX
  357. CMPQ BX, $0x40
  358. JA sequenceDecs_decode_56_amd64_ml_update_zero
  359. CMPQ CX, $0x40
  360. JAE sequenceDecs_decode_56_amd64_ml_update_zero
  361. NEGQ CX
  362. SHRQ CL, R15
  363. ADDQ R15, AX
  364. sequenceDecs_decode_56_amd64_ml_update_zero:
  365. MOVQ AX, 8(R10)
  366. // Update literal length
     // Same extraction again, using state word DI, without refilling DX first;
     // result goes to (R10).
  367. MOVQ DI, AX
  368. MOVQ BX, CX
  369. MOVQ DX, R15
  370. SHLQ CL, R15
  371. MOVB AH, CL
  372. SHRQ $0x20, AX
  373. TESTQ CX, CX
  374. JZ sequenceDecs_decode_56_amd64_ll_update_zero
  375. ADDQ CX, BX
  376. CMPQ BX, $0x40
  377. JA sequenceDecs_decode_56_amd64_ll_update_zero
  378. CMPQ CX, $0x40
  379. JAE sequenceDecs_decode_56_amd64_ll_update_zero
  380. NEGQ CX
  381. SHRQ CL, R15
  382. ADDQ R15, AX
  383. sequenceDecs_decode_56_amd64_ll_update_zero:
  384. MOVQ AX, (R10)
  385. // Fill bitreader for state updates
     // Save the read pointer back to its stack slot, then keep the offset
     // state's extra-bit count (byte 1 of R9) in AX: R9 is overwritten by the
     // state update below, but the count is needed by "Adjust offset".
  386. MOVQ R14, (SP)
  387. MOVQ R9, AX
  388. SHRQ $0x08, AX
  389. MOVBQZX AL, AX
     // When the counter at 96(ctx) is zero this is the final iteration (the
     // DECQ at the bottom will take it negative), so the states are not advanced.
  390. MOVQ ctx+16(FP), CX
  391. CMPQ 96(CX), $0x00
  392. JZ sequenceDecs_decode_56_amd64_skip_update
  393. // Update Literal Length State
     // R14 = bits to read (byte 0), DI = next-state base (bits 16..31).
     // BX advances by R14; rotating DX left by the new BX brings the bits just
     // read into the low end, where they are masked with (1<<R14)-1 and added
     // to the base. BP is used as scratch for the mask.
  394. MOVBQZX DI, R14
  395. SHRQ $0x10, DI
  396. MOVWQZX DI, DI
  397. LEAQ (BX)(R14*1), CX
  398. MOVQ DX, R15
  399. MOVQ CX, BX
  400. ROLQ CL, R15
  401. MOVL $0x00000001, BP
  402. MOVB R14, CL
  403. SHLL CL, BP
  404. DECL BP
  405. ANDQ BP, R15
  406. ADDQ R15, DI
  407. // Load ctx.llTable
     // DI becomes the 8-byte table entry at the index just computed.
  408. MOVQ ctx+16(FP), CX
  409. MOVQ (CX), CX
  410. MOVQ (CX)(DI*8), DI
  411. // Update Match Length State
     // Same steps as for the literal-length state, applied to R8.
  412. MOVBQZX R8, R14
  413. SHRQ $0x10, R8
  414. MOVWQZX R8, R8
  415. LEAQ (BX)(R14*1), CX
  416. MOVQ DX, R15
  417. MOVQ CX, BX
  418. ROLQ CL, R15
  419. MOVL $0x00000001, BP
  420. MOVB R14, CL
  421. SHLL CL, BP
  422. DECL BP
  423. ANDQ BP, R15
  424. ADDQ R15, R8
  425. // Load ctx.mlTable
  426. MOVQ ctx+16(FP), CX
  427. MOVQ 24(CX), CX
  428. MOVQ (CX)(R8*8), R8
  429. // Update Offset State
     // Same steps again, applied to R9.
  430. MOVBQZX R9, R14
  431. SHRQ $0x10, R9
  432. MOVWQZX R9, R9
  433. LEAQ (BX)(R14*1), CX
  434. MOVQ DX, R15
  435. MOVQ CX, BX
  436. ROLQ CL, R15
  437. MOVL $0x00000001, BP
  438. MOVB R14, CL
  439. SHLL CL, BP
  440. DECL BP
  441. ANDQ BP, R15
  442. ADDQ R15, R9
  443. // Load ctx.ofTable
  444. MOVQ ctx+16(FP), CX
  445. MOVQ 48(CX), CX
  446. MOVQ (CX)(R9*8), R9
  447. sequenceDecs_decode_56_amd64_skip_update:
  448. // Adjust offset
     // CX = offset value decoded above; AX = the offset's extra-bit count.
     // More than one extra bit: the value is used as-is and pushed onto the
     // front of the history (R11 <- CX, R12 <- old R11, R13 <- old R12).
  449. MOVQ 16(R10), CX
  450. CMPQ AX, $0x01
  451. JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
  452. MOVQ R12, R13
  453. MOVQ R11, R12
  454. MOVQ CX, R11
  455. JMP sequenceDecs_decode_56_amd64_after_adjust
     // Zero or one extra bit: CX selects from the history instead.
     // If the literal length just stored is zero, the selector is incremented.
  456. sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
  457. CMPQ (R10), $0x00000000
  458. JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
  459. INCQ CX
  460. JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
     // Non-zero literal length with selector 0: reuse R11; history unchanged.
  461. sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
  462. TESTQ CX, CX
  463. JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
  464. MOVQ R11, CX
  465. JMP sequenceDecs_decode_56_amd64_after_adjust
     // Choose the candidate (AX) by selector CX:
     //   0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11 - 1.
  466. sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
  467. CMPQ CX, $0x01
  468. JB sequenceDecs_decode_56_amd64_adjust_zero
  469. JEQ sequenceDecs_decode_56_amd64_adjust_one
  470. CMPQ CX, $0x02
  471. JA sequenceDecs_decode_56_amd64_adjust_three
  472. JMP sequenceDecs_decode_56_amd64_adjust_two
  473. sequenceDecs_decode_56_amd64_adjust_zero:
  474. MOVQ R11, AX
  475. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  476. sequenceDecs_decode_56_amd64_adjust_one:
  477. MOVQ R12, AX
  478. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  479. sequenceDecs_decode_56_amd64_adjust_two:
  480. MOVQ R13, AX
  481. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  482. sequenceDecs_decode_56_amd64_adjust_three:
  483. LEAQ -1(R11), AX
     // A candidate of zero is replaced with 1.
  484. sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
  485. TESTQ AX, AX
  486. JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
  487. MOVQ $0x00000001, AX
     // Rotate the history: R13 takes R12 unless the selector was 1, R12 takes
     // R11, and the candidate becomes both R11 and the resulting offset.
  488. sequenceDecs_decode_56_amd64_adjust_temp_valid:
  489. CMPQ CX, $0x01
  490. CMOVQNE R12, R13
  491. MOVQ R11, R12
  492. MOVQ AX, R11
  493. MOVQ AX, CX
  494. sequenceDecs_decode_56_amd64_after_adjust:
  495. MOVQ CX, 16(R10)
  496. // Check values
     // AX = match length, R14 = literal length, CX = final offset.
     // 256(s) accumulates literal length + match length; 128(ctx) is reduced
     // by the literal length and a negative result is error 4.
  497. MOVQ 8(R10), AX
  498. MOVQ (R10), R14
  499. LEAQ (AX)(R14*1), R15
  500. MOVQ s+0(FP), BP
  501. ADDQ R15, 256(BP)
  502. MOVQ ctx+16(FP), R15
  503. SUBQ R14, 128(R15)
  504. JS error_not_enough_literals
     // Match length above 0x20002 is error 2.
  505. CMPQ AX, $0x00020002
  506. JA sequenceDecs_decode_56_amd64_error_match_len_too_big
     // A zero offset is only accepted together with a zero match length.
  507. TESTQ CX, CX
  508. JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
  509. TESTQ AX, AX
  510. JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
     // Advance the output cursor by one 24-byte record, decrement the counter
     // at 96(ctx) and loop while it is still non-negative.
  511. sequenceDecs_decode_56_amd64_match_len_ofs_ok:
  512. ADDQ $0x18, R10
  513. MOVQ ctx+16(FP), AX
  514. DECQ 96(AX)
  515. JNS sequenceDecs_decode_56_amd64_main_loop
     // Loop finished: write the offset history back to s and the bit-reader
     // registers (DX, BL, SI) back to br.
  516. MOVQ s+0(FP), AX
  517. MOVQ R11, 144(AX)
  518. MOVQ R12, 152(AX)
  519. MOVQ R13, 160(AX)
  520. MOVQ br+8(FP), AX
  521. MOVQ DX, 32(AX)
  522. MOVB BL, 40(AX)
  523. MOVQ SI, 24(AX)
  524. // Return success
  525. MOVQ $0x00000000, ret+24(FP)
  526. RET
  527. // Return with match length error
  528. sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
  529. MOVQ $0x00000001, ret+24(FP)
  530. RET
  531. // Return with match too long error
  532. sequenceDecs_decode_56_amd64_error_match_len_too_big:
  533. MOVQ $0x00000002, ret+24(FP)
  534. RET
  535. // Return with match offset too long error
     // No label precedes this return and the previous instruction is RET, so
     // it is not reached from anywhere in this function.
  536. MOVQ $0x00000003, ret+24(FP)
  537. RET
  538. // Return with not enough literals error
  539. error_not_enough_literals:
  540. MOVQ $0x00000004, ret+24(FP)
  541. RET
  542. // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  543. // Requires: BMI, BMI2, CMOV
  //
  // Decodes a run of sequences. Each loop iteration reads an offset, a match
  // length and a literal length from the bit buffer, advances the three decoder
  // states through the tables referenced by ctx (skipped when the counter at
  // ctx+96 is zero), resolves the offset against a three-entry offset history,
  // validates the values and stores them in a 24-byte record:
  // literal length at +0, match length at +8, offset at +16.
  //
  // Result written to ret+24:
  //   0  the counter at ctx+96 went negative, i.e. the loop completed
  //   1  final offset is zero while the match length is not
  //   2  match length is above 0x20002
  //   4  the literal budget at ctx+128 went negative
  // Only the success exit stores the bit-reader fields (br+24, br+32, br+40) and
  // the offset history (s+144, s+152, s+160) back; the error exits return at once.
  //
  // Register roles inside the loop:
  //   AX    bit buffer (br+32)
  //   DX    bit position inside AX (br+40)
  //   BX    distance of the input cursor from the pointer at br+0 (br+24)
  //   (SP)  input cursor between iterations; held in R13 while bits are read
  //   SI, DI, R8   literal length, match length and offset state
  //   R9    cursor over the output records (from ctx+104)
  //   R10, R11, R12  offset history, R10 being the most recently pushed entry
  //   CX, R13, R14, R15, BP  scratch
  //
  // Layout of a state word as this code uses it:
  //   bits 0-7    number of bits to read for the next state
  //   bits 8-15   number of extra bits to read for the value
  //   bits 16-31  base added to those bits to form the next table index
  //   bits 32-63  base added to the extra bits to form the value
  544. TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
  545. MOVQ br+8(FP), CX
  546. MOVQ 32(CX), AX // AX = bit buffer
  547. MOVBQZX 40(CX), DX // DX = bit position (single byte, zero-extended)
  548. MOVQ 24(CX), BX // BX = cursor distance from the start of the input
  549. MOVQ (CX), CX
  550. ADDQ BX, CX
  551. MOVQ CX, (SP) // input cursor = pointer at br+0 plus BX
  552. MOVQ ctx+16(FP), CX
  553. MOVQ 72(CX), SI // literal length state
  554. MOVQ 80(CX), DI // match length state
  555. MOVQ 88(CX), R8 // offset state
  556. MOVQ 104(CX), R9 // output record cursor
  557. MOVQ s+0(FP), CX
  558. MOVQ 144(CX), R10 // offset history
  559. MOVQ 152(CX), R11
  560. MOVQ 160(CX), R12
  561. sequenceDecs_decode_bmi2_main_loop:
  562. MOVQ (SP), R13 // R13 = input cursor for this iteration
  563. // Fill bitreader to have enough for the offset and match length.
  // Fast path when BX >= 8 (signed compare): move the cursor back by the
  // number of whole bytes already consumed (DX / 8), reload 8 bytes from
  // there and keep only the sub-byte part of the bit position.
  564. CMPQ BX, $0x08
  565. JL sequenceDecs_decode_bmi2_fill_byte_by_byte
  566. MOVQ DX, CX
  567. SHRQ $0x03, CX
  568. SUBQ CX, R13
  569. MOVQ (R13), AX
  570. SUBQ CX, BX
  571. ANDQ $0x07, DX
  572. JMP sequenceDecs_decode_bmi2_fill_end
  573. sequenceDecs_decode_bmi2_fill_byte_by_byte:
  // Slow path: while BX > 0 and DX > 7, shift the buffer up by one byte,
  // step the cursor back one byte and OR that byte into the low end.
  574. CMPQ BX, $0x00
  575. JLE sequenceDecs_decode_bmi2_fill_end
  576. CMPQ DX, $0x07
  577. JLE sequenceDecs_decode_bmi2_fill_end
  578. SHLQ $0x08, AX
  579. SUBQ $0x01, R13
  580. SUBQ $0x01, BX
  581. SUBQ $0x08, DX
  582. MOVBQZX (R13), CX
  583. ORQ CX, AX
  584. JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
  585. sequenceDecs_decode_bmi2_fill_end:
  586. // Update offset
  587. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  588. BEXTRQ CX, R8, R14 // R14 = number of extra bits for the offset
  589. MOVQ AX, R15
  590. LEAQ (DX)(R14*1), CX // CX = bit position after this read
  591. ROLQ CL, R15 // rotate the bits being read down to the low end
  592. BZHIQ R14, R15, R15 // keep only the low R14 bits
  593. MOVQ CX, DX
  594. MOVQ R8, CX
  595. SHRQ $0x20, CX // upper 32 bits of the state: value base
  596. ADDQ R15, CX
  597. MOVQ CX, 16(R9) // record offset = base + extra bits
  598. // Update match length
  599. MOVQ $0x00000808, CX
  600. BEXTRQ CX, DI, R14
  601. MOVQ AX, R15
  602. LEAQ (DX)(R14*1), CX
  603. ROLQ CL, R15
  604. BZHIQ R14, R15, R15
  605. MOVQ CX, DX
  606. MOVQ DI, CX
  607. SHRQ $0x20, CX
  608. ADDQ R15, CX
  609. MOVQ CX, 8(R9) // record match length
  610. // Fill bitreader to have enough for the remaining
  // Same two refill paths as at the top of the loop.
  611. CMPQ BX, $0x08
  612. JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  613. MOVQ DX, CX
  614. SHRQ $0x03, CX
  615. SUBQ CX, R13
  616. MOVQ (R13), AX
  617. SUBQ CX, BX
  618. ANDQ $0x07, DX
  619. JMP sequenceDecs_decode_bmi2_fill_2_end
  620. sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
  621. CMPQ BX, $0x00
  622. JLE sequenceDecs_decode_bmi2_fill_2_end
  623. CMPQ DX, $0x07
  624. JLE sequenceDecs_decode_bmi2_fill_2_end
  625. SHLQ $0x08, AX
  626. SUBQ $0x01, R13
  627. SUBQ $0x01, BX
  628. SUBQ $0x08, DX
  629. MOVBQZX (R13), CX
  630. ORQ CX, AX
  631. JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  632. sequenceDecs_decode_bmi2_fill_2_end:
  633. // Update literal length
  634. MOVQ $0x00000808, CX
  635. BEXTRQ CX, SI, R14
  636. MOVQ AX, R15
  637. LEAQ (DX)(R14*1), CX
  638. ROLQ CL, R15
  639. BZHIQ R14, R15, R15
  640. MOVQ CX, DX
  641. MOVQ SI, CX
  642. SHRQ $0x20, CX
  643. ADDQ R15, CX
  644. MOVQ CX, (R9) // record literal length
  645. // Save the input cursor, then read the bits for the state updates
  // (no refill happens here).
  646. MOVQ R13, (SP)
  647. MOVQ $0x00000808, CX
  648. BEXTRQ CX, R8, R13 // R13 = extra-bit count of the offset, taken before the state changes
  649. MOVQ ctx+16(FP), CX
  650. CMPQ 96(CX), $0x00 // counter at zero: skip the state update
  651. JZ sequenceDecs_decode_bmi2_skip_update
  652. LEAQ (SI)(DI*1), R14
  653. ADDQ R8, R14
  654. MOVBQZX R14, R14 // R14 = low byte of the sum of the three states = total bits to read
  655. LEAQ (DX)(R14*1), CX
  656. MOVQ AX, R15
  657. MOVQ CX, DX
  658. ROLQ CL, R15
  659. BZHIQ R14, R15, R15 // R15 = all state-update bits, split up below
  660. // Update Offset State
  661. BZHIQ R8, R15, CX // low bits of R15; the count is the low byte of R8
  662. SHRXQ R8, R15, R15 // drop the bits just taken
  663. MOVQ $0x00001010, R14 // BEXTR control: start bit 16, length 16
  664. BEXTRQ R14, R8, R8
  665. ADDQ CX, R8 // table index = bits 16-31 of the old state + bits read
  666. // Load ctx.ofTable
  667. MOVQ ctx+16(FP), CX
  668. MOVQ 48(CX), CX
  669. MOVQ (CX)(R8*8), R8 // 8-byte table entries
  670. // Update Match Length State
  671. BZHIQ DI, R15, CX
  672. SHRXQ DI, R15, R15
  673. MOVQ $0x00001010, R14
  674. BEXTRQ R14, DI, DI
  675. ADDQ CX, DI
  676. // Load ctx.mlTable
  677. MOVQ ctx+16(FP), CX
  678. MOVQ 24(CX), CX
  679. MOVQ (CX)(DI*8), DI
  680. // Update Literal Length State
  // Last consumer of R15, so no shift is needed afterwards.
  681. BZHIQ SI, R15, CX
  682. MOVQ $0x00001010, R14
  683. BEXTRQ R14, SI, SI
  684. ADDQ CX, SI
  685. // Load ctx.llTable
  686. MOVQ ctx+16(FP), CX
  687. MOVQ (CX), CX
  688. MOVQ (CX)(SI*8), SI
  689. sequenceDecs_decode_bmi2_skip_update:
  690. // Adjust offset
  // CX = offset as read. If it carried more than one extra bit (R13 > 1) it is
  // used as is and pushed onto the history. Otherwise it selects a history entry.
  // NOTE(review): presumably the Zstandard repeat-offset rule - confirm against the format spec.
  691. MOVQ 16(R9), CX
  692. CMPQ R13, $0x01
  693. JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
  694. MOVQ R11, R12
  695. MOVQ R10, R11
  696. MOVQ CX, R10
  697. JMP sequenceDecs_decode_bmi2_after_adjust
  698. sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
  699. CMPQ (R9), $0x00000000 // literal length of this record
  700. JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
  701. INCQ CX // zero literal length: selector is one higher
  702. JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
  703. sequenceDecs_decode_bmi2_adjust_offset_maybezero:
  704. TESTQ CX, CX
  705. JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
  706. MOVQ R10, CX // selector 0: reuse the newest entry, history unchanged
  707. JMP sequenceDecs_decode_bmi2_after_adjust
  708. sequenceDecs_decode_bmi2_adjust_offset_nonzero:
  // Pick the candidate into R13: CX == 0 -> R10, CX == 1 -> R11,
  // CX == 2 -> R12, CX > 2 -> R10 - 1.
  709. CMPQ CX, $0x01
  710. JB sequenceDecs_decode_bmi2_adjust_zero
  711. JEQ sequenceDecs_decode_bmi2_adjust_one
  712. CMPQ CX, $0x02
  713. JA sequenceDecs_decode_bmi2_adjust_three
  714. JMP sequenceDecs_decode_bmi2_adjust_two
  715. sequenceDecs_decode_bmi2_adjust_zero:
  716. MOVQ R10, R13
  717. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  718. sequenceDecs_decode_bmi2_adjust_one:
  719. MOVQ R11, R13
  720. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  721. sequenceDecs_decode_bmi2_adjust_two:
  722. MOVQ R12, R13
  723. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  724. sequenceDecs_decode_bmi2_adjust_three:
  725. LEAQ -1(R10), R13
  726. sequenceDecs_decode_bmi2_adjust_test_temp_valid:
  727. TESTQ R13, R13
  728. JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
  729. MOVQ $0x00000001, R13 // a zero candidate is replaced by 1
  730. sequenceDecs_decode_bmi2_adjust_temp_valid:
  // Move the candidate to the front. The oldest entry is overwritten unless
  // the selector was 1 (then only the two newest entries swap).
  731. CMPQ CX, $0x01
  732. CMOVQNE R11, R12
  733. MOVQ R10, R11
  734. MOVQ R13, R10
  735. MOVQ R13, CX
  736. sequenceDecs_decode_bmi2_after_adjust:
  737. MOVQ CX, 16(R9) // store the final offset in the record
  738. // Check values
  739. MOVQ 8(R9), R13 // match length
  740. MOVQ (R9), R14 // literal length
  741. LEAQ (R13)(R14*1), R15
  742. MOVQ s+0(FP), BP
  743. ADDQ R15, 256(BP) // running total at s+256 += literal length + match length
  744. MOVQ ctx+16(FP), R15
  745. SUBQ R14, 128(R15) // literal budget at ctx+128 -= literal length
  746. JS error_not_enough_literals
  747. CMPQ R13, $0x00020002
  748. JA sequenceDecs_decode_bmi2_error_match_len_too_big
  749. TESTQ CX, CX
  750. JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
  751. TESTQ R13, R13 // offset is zero: the match length must be zero too
  752. JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
  753. sequenceDecs_decode_bmi2_match_len_ofs_ok:
  754. ADDQ $0x18, R9 // advance to the next 24-byte record
  755. MOVQ ctx+16(FP), CX
  756. DECQ 96(CX)
  757. JNS sequenceDecs_decode_bmi2_main_loop // loop until the counter goes negative
  // Success: write the offset history and the bit-reader fields back.
  758. MOVQ s+0(FP), CX
  759. MOVQ R10, 144(CX)
  760. MOVQ R11, 152(CX)
  761. MOVQ R12, 160(CX)
  762. MOVQ br+8(FP), CX
  763. MOVQ AX, 32(CX)
  764. MOVB DL, 40(CX)
  765. MOVQ BX, 24(CX)
  766. // Return success
  767. MOVQ $0x00000000, ret+24(FP)
  768. RET
  769. // Return with match length error
  770. sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
  771. MOVQ $0x00000001, ret+24(FP)
  772. RET
  773. // Return with match too long error
  774. sequenceDecs_decode_bmi2_error_match_len_too_big:
  775. MOVQ $0x00000002, ret+24(FP)
  776. RET
  777. // Return with match offset too long error
  // No label precedes this return and it follows a RET, so nothing in this
  // function reaches it.
  778. MOVQ $0x00000003, ret+24(FP)
  779. RET
  780. // Return with not enough literals error
  781. error_not_enough_literals:
  782. MOVQ $0x00000004, ret+24(FP)
  783. RET
  784. // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  785. // Requires: BMI, BMI2, CMOV
  //
  // Decodes a run of sequences with a single bit-buffer refill per iteration:
  // offset, match length and literal length are all read after the one refill
  // at the top of the loop. Each iteration then advances the three decoder
  // states (skipped when the counter at ctx+96 is zero), resolves the offset
  // against a three-entry offset history, validates the values and stores them
  // in a 24-byte record: literal length at +0, match length at +8, offset at +16.
  // NOTE(review): the "56" in the name presumably bounds the bits needed per
  // iteration - confirm with the caller that selects this variant.
  //
  // Result written to ret+24:
  //   0  the counter at ctx+96 went negative, i.e. the loop completed
  //   1  final offset is zero while the match length is not
  //   2  match length is above 0x20002
  //   4  the literal budget at ctx+128 went negative
  // Only the success exit stores the bit-reader fields (br+24, br+32, br+40) and
  // the offset history (s+144, s+152, s+160) back; the error exits return at once.
  //
  // Register roles inside the loop:
  //   AX    bit buffer (br+32)
  //   DX    bit position inside AX (br+40)
  //   BX    distance of the input cursor from the pointer at br+0 (br+24)
  //   (SP)  input cursor between iterations; held in R13 while bits are read
  //   SI, DI, R8   literal length, match length and offset state
  //   R9    cursor over the output records (from ctx+104)
  //   R10, R11, R12  offset history, R10 being the most recently pushed entry
  //   CX, R13, R14, R15, BP  scratch
  //
  // Layout of a state word as this code uses it:
  //   bits 0-7    number of bits to read for the next state
  //   bits 8-15   number of extra bits to read for the value
  //   bits 16-31  base added to those bits to form the next table index
  //   bits 32-63  base added to the extra bits to form the value
  786. TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
  787. MOVQ br+8(FP), CX
  788. MOVQ 32(CX), AX // AX = bit buffer
  789. MOVBQZX 40(CX), DX // DX = bit position (single byte, zero-extended)
  790. MOVQ 24(CX), BX // BX = cursor distance from the start of the input
  791. MOVQ (CX), CX
  792. ADDQ BX, CX
  793. MOVQ CX, (SP) // input cursor = pointer at br+0 plus BX
  794. MOVQ ctx+16(FP), CX
  795. MOVQ 72(CX), SI // literal length state
  796. MOVQ 80(CX), DI // match length state
  797. MOVQ 88(CX), R8 // offset state
  798. MOVQ 104(CX), R9 // output record cursor
  799. MOVQ s+0(FP), CX
  800. MOVQ 144(CX), R10 // offset history
  801. MOVQ 152(CX), R11
  802. MOVQ 160(CX), R12
  803. sequenceDecs_decode_56_bmi2_main_loop:
  804. MOVQ (SP), R13 // R13 = input cursor for this iteration
  805. // Fill bitreader to have enough for the offset and match length.
  // This is the only refill in the loop body. Fast path when BX >= 8 (signed
  // compare): move the cursor back by the whole bytes already consumed
  // (DX / 8), reload 8 bytes and keep only the sub-byte part of the position.
  806. CMPQ BX, $0x08
  807. JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
  808. MOVQ DX, CX
  809. SHRQ $0x03, CX
  810. SUBQ CX, R13
  811. MOVQ (R13), AX
  812. SUBQ CX, BX
  813. ANDQ $0x07, DX
  814. JMP sequenceDecs_decode_56_bmi2_fill_end
  815. sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
  // Slow path: while BX > 0 and DX > 7, shift the buffer up by one byte,
  // step the cursor back one byte and OR that byte into the low end.
  816. CMPQ BX, $0x00
  817. JLE sequenceDecs_decode_56_bmi2_fill_end
  818. CMPQ DX, $0x07
  819. JLE sequenceDecs_decode_56_bmi2_fill_end
  820. SHLQ $0x08, AX
  821. SUBQ $0x01, R13
  822. SUBQ $0x01, BX
  823. SUBQ $0x08, DX
  824. MOVBQZX (R13), CX
  825. ORQ CX, AX
  826. JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
  827. sequenceDecs_decode_56_bmi2_fill_end:
  828. // Update offset
  829. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  830. BEXTRQ CX, R8, R14 // R14 = number of extra bits for the offset
  831. MOVQ AX, R15
  832. LEAQ (DX)(R14*1), CX // CX = bit position after this read
  833. ROLQ CL, R15 // rotate the bits being read down to the low end
  834. BZHIQ R14, R15, R15 // keep only the low R14 bits
  835. MOVQ CX, DX
  836. MOVQ R8, CX
  837. SHRQ $0x20, CX // upper 32 bits of the state: value base
  838. ADDQ R15, CX
  839. MOVQ CX, 16(R9) // record offset = base + extra bits
  840. // Update match length
  841. MOVQ $0x00000808, CX
  842. BEXTRQ CX, DI, R14
  843. MOVQ AX, R15
  844. LEAQ (DX)(R14*1), CX
  845. ROLQ CL, R15
  846. BZHIQ R14, R15, R15
  847. MOVQ CX, DX
  848. MOVQ DI, CX
  849. SHRQ $0x20, CX
  850. ADDQ R15, CX
  851. MOVQ CX, 8(R9) // record match length
  852. // Update literal length
  853. MOVQ $0x00000808, CX
  854. BEXTRQ CX, SI, R14
  855. MOVQ AX, R15
  856. LEAQ (DX)(R14*1), CX
  857. ROLQ CL, R15
  858. BZHIQ R14, R15, R15
  859. MOVQ CX, DX
  860. MOVQ SI, CX
  861. SHRQ $0x20, CX
  862. ADDQ R15, CX
  863. MOVQ CX, (R9) // record literal length
  864. // Save the input cursor, then read the bits for the state updates
  // (no refill happens here).
  865. MOVQ R13, (SP)
  866. MOVQ $0x00000808, CX
  867. BEXTRQ CX, R8, R13 // R13 = extra-bit count of the offset, taken before the state changes
  868. MOVQ ctx+16(FP), CX
  869. CMPQ 96(CX), $0x00 // counter at zero: skip the state update
  870. JZ sequenceDecs_decode_56_bmi2_skip_update
  871. LEAQ (SI)(DI*1), R14
  872. ADDQ R8, R14
  873. MOVBQZX R14, R14 // R14 = low byte of the sum of the three states = total bits to read
  874. LEAQ (DX)(R14*1), CX
  875. MOVQ AX, R15
  876. MOVQ CX, DX
  877. ROLQ CL, R15
  878. BZHIQ R14, R15, R15 // R15 = all state-update bits, split up below
  879. // Update Offset State
  880. BZHIQ R8, R15, CX // low bits of R15; the count is the low byte of R8
  881. SHRXQ R8, R15, R15 // drop the bits just taken
  882. MOVQ $0x00001010, R14 // BEXTR control: start bit 16, length 16
  883. BEXTRQ R14, R8, R8
  884. ADDQ CX, R8 // table index = bits 16-31 of the old state + bits read
  885. // Load ctx.ofTable
  886. MOVQ ctx+16(FP), CX
  887. MOVQ 48(CX), CX
  888. MOVQ (CX)(R8*8), R8 // 8-byte table entries
  889. // Update Match Length State
  890. BZHIQ DI, R15, CX
  891. SHRXQ DI, R15, R15
  892. MOVQ $0x00001010, R14
  893. BEXTRQ R14, DI, DI
  894. ADDQ CX, DI
  895. // Load ctx.mlTable
  896. MOVQ ctx+16(FP), CX
  897. MOVQ 24(CX), CX
  898. MOVQ (CX)(DI*8), DI
  899. // Update Literal Length State
  // Last consumer of R15, so no shift is needed afterwards.
  900. BZHIQ SI, R15, CX
  901. MOVQ $0x00001010, R14
  902. BEXTRQ R14, SI, SI
  903. ADDQ CX, SI
  904. // Load ctx.llTable
  905. MOVQ ctx+16(FP), CX
  906. MOVQ (CX), CX
  907. MOVQ (CX)(SI*8), SI
  908. sequenceDecs_decode_56_bmi2_skip_update:
  909. // Adjust offset
  // CX = offset as read. If it carried more than one extra bit (R13 > 1) it is
  // used as is and pushed onto the history. Otherwise it selects a history entry.
  // NOTE(review): presumably the Zstandard repeat-offset rule - confirm against the format spec.
  910. MOVQ 16(R9), CX
  911. CMPQ R13, $0x01
  912. JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
  913. MOVQ R11, R12
  914. MOVQ R10, R11
  915. MOVQ CX, R10
  916. JMP sequenceDecs_decode_56_bmi2_after_adjust
  917. sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
  918. CMPQ (R9), $0x00000000 // literal length of this record
  919. JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
  920. INCQ CX // zero literal length: selector is one higher
  921. JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  922. sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
  923. TESTQ CX, CX
  924. JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  925. MOVQ R10, CX // selector 0: reuse the newest entry, history unchanged
  926. JMP sequenceDecs_decode_56_bmi2_after_adjust
  927. sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
  // Pick the candidate into R13: CX == 0 -> R10, CX == 1 -> R11,
  // CX == 2 -> R12, CX > 2 -> R10 - 1.
  928. CMPQ CX, $0x01
  929. JB sequenceDecs_decode_56_bmi2_adjust_zero
  930. JEQ sequenceDecs_decode_56_bmi2_adjust_one
  931. CMPQ CX, $0x02
  932. JA sequenceDecs_decode_56_bmi2_adjust_three
  933. JMP sequenceDecs_decode_56_bmi2_adjust_two
  934. sequenceDecs_decode_56_bmi2_adjust_zero:
  935. MOVQ R10, R13
  936. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  937. sequenceDecs_decode_56_bmi2_adjust_one:
  938. MOVQ R11, R13
  939. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  940. sequenceDecs_decode_56_bmi2_adjust_two:
  941. MOVQ R12, R13
  942. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  943. sequenceDecs_decode_56_bmi2_adjust_three:
  944. LEAQ -1(R10), R13
  945. sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
  946. TESTQ R13, R13
  947. JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
  948. MOVQ $0x00000001, R13 // a zero candidate is replaced by 1
  949. sequenceDecs_decode_56_bmi2_adjust_temp_valid:
  // Move the candidate to the front. The oldest entry is overwritten unless
  // the selector was 1 (then only the two newest entries swap).
  950. CMPQ CX, $0x01
  951. CMOVQNE R11, R12
  952. MOVQ R10, R11
  953. MOVQ R13, R10
  954. MOVQ R13, CX
  955. sequenceDecs_decode_56_bmi2_after_adjust:
  956. MOVQ CX, 16(R9) // store the final offset in the record
  957. // Check values
  958. MOVQ 8(R9), R13 // match length
  959. MOVQ (R9), R14 // literal length
  960. LEAQ (R13)(R14*1), R15
  961. MOVQ s+0(FP), BP
  962. ADDQ R15, 256(BP) // running total at s+256 += literal length + match length
  963. MOVQ ctx+16(FP), R15
  964. SUBQ R14, 128(R15) // literal budget at ctx+128 -= literal length
  965. JS error_not_enough_literals
  966. CMPQ R13, $0x00020002
  967. JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
  968. TESTQ CX, CX
  969. JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
  970. TESTQ R13, R13 // offset is zero: the match length must be zero too
  971. JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
  972. sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
  973. ADDQ $0x18, R9 // advance to the next 24-byte record
  974. MOVQ ctx+16(FP), CX
  975. DECQ 96(CX)
  976. JNS sequenceDecs_decode_56_bmi2_main_loop // loop until the counter goes negative
  // Success: write the offset history and the bit-reader fields back.
  977. MOVQ s+0(FP), CX
  978. MOVQ R10, 144(CX)
  979. MOVQ R11, 152(CX)
  980. MOVQ R12, 160(CX)
  981. MOVQ br+8(FP), CX
  982. MOVQ AX, 32(CX)
  983. MOVB DL, 40(CX)
  984. MOVQ BX, 24(CX)
  985. // Return success
  986. MOVQ $0x00000000, ret+24(FP)
  987. RET
  988. // Return with match length error
  989. sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
  990. MOVQ $0x00000001, ret+24(FP)
  991. RET
  992. // Return with match too long error
  993. sequenceDecs_decode_56_bmi2_error_match_len_too_big:
  994. MOVQ $0x00000002, ret+24(FP)
  995. RET
  996. // Return with match offset too long error
  // No label precedes this return and it follows a RET, so nothing in this
  // function reaches it.
  997. MOVQ $0x00000003, ret+24(FP)
  998. RET
  999. // Return with not enough literals error
  1000. error_not_enough_literals:
  1001. MOVQ $0x00000004, ret+24(FP)
  1002. RET
  1003. // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
  1004. // Requires: SSE
  //
  // Executes decoded sequences: for every 24-byte record (literal length at +0,
  // match length at +8, match offset at +16) it copies the literals to the
  // output and then copies the match, taking the match bytes from the history
  // buffer, from the output written so far, or from both.
  //
  // Literal copies and non-overlapping match copies move whole 16-byte units,
  // so they can read and write up to 15 bytes beyond the requested length.
  // NOTE(review): the caller presumably guarantees that slack - confirm.
  //
  // Result (ret+8): 1 when all records were processed or there were none,
  // 0 when a match offset is greater than output position + history length or
  // greater than the window size. Except on the empty-input exit, the record
  // index (ctx+24), output position (ctx+104) and literal bytes consumed
  // (ctx+112) are written back before returning.
  //
  // Register roles:
  //   AX  current record          CX  number of records (ctx+8)
  //   DX  record index (ctx+24)   BX  output write pointer (ctx+32 + ctx+104)
  //   SI  literal read pointer (ctx+80)
  //   DI  output position (ctx+104)
  //   R8  window size (ctx+120)
  //   R9  one past the end of the history (ctx+56 + ctx+64)
  //   R10 history length (ctx+64)
  //   R11 literal length, later scratch   R12 match offset   R13 match length
  //   R14, R15, BP, X0  scratch
  1005. TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
  1006. MOVQ ctx+0(FP), R10
  1007. MOVQ 8(R10), CX
  1008. TESTQ CX, CX
  1009. JZ empty_seqs
  1010. MOVQ (R10), AX
  1011. MOVQ 24(R10), DX
  1012. MOVQ 32(R10), BX
  1013. MOVQ 80(R10), SI
  1014. MOVQ 104(R10), DI
  1015. MOVQ 120(R10), R8
  1016. MOVQ 56(R10), R9
  1017. MOVQ 64(R10), R10 // R10 now holds the history length, no longer ctx
  1018. ADDQ R10, R9
  1019. // seqsBase += 24 * seqIndex
  1020. LEAQ (DX)(DX*2), R11
  1021. SHLQ $0x03, R11
  1022. ADDQ R11, AX
  1023. // outBase += outPosition
  1024. ADDQ DI, BX
  1025. main_loop:
  1026. MOVQ (AX), R11 // literal length
  1027. MOVQ 16(AX), R12 // match offset
  1028. MOVQ 8(AX), R13 // match length
  1029. // Copy literals
  1030. TESTQ R11, R11
  1031. JZ check_offset
  1032. XORQ R14, R14
  1033. copy_1:
  // 16 bytes per step until at least R11 bytes are copied; the pointers
  // then advance by exactly R11.
  1034. MOVUPS (SI)(R14*1), X0
  1035. MOVUPS X0, (BX)(R14*1)
  1036. ADDQ $0x10, R14
  1037. CMPQ R14, R11
  1038. JB copy_1
  1039. ADDQ R11, SI
  1040. ADDQ R11, BX
  1041. ADDQ R11, DI
  1042. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  1043. check_offset:
  1044. LEAQ (DI)(R10*1), R11
  1045. CMPQ R12, R11
  1046. JG error_match_off_too_big
  1047. CMPQ R12, R8
  1048. JG error_match_off_too_big
  1049. // Copy match from history
  1050. MOVQ R12, R11
  1051. SUBQ DI, R11 // R11 = offset - output position = bytes that lie in the history
  1052. JLS copy_match // offset <= output position: match is entirely in the output
  1053. MOVQ R9, R14
  1054. SUBQ R11, R14 // R14 = source address inside the history
  1055. CMPQ R13, R11
  1056. JG copy_all_from_history // match runs past the end of the history
  // The whole match lies in the history: copy exactly R13 bytes.
  1057. MOVQ R13, R11
  1058. SUBQ $0x10, R11
  1059. JB copy_4_small
  1060. copy_4_loop:
  1061. MOVUPS (R14), X0
  1062. MOVUPS X0, (BX)
  1063. ADDQ $0x10, R14
  1064. ADDQ $0x10, BX
  1065. SUBQ $0x10, R11
  1066. JAE copy_4_loop
  // R11 is now negative: place both pointers at the exact end and copy the
  // final 16 bytes ending there (may overlap the previous unit).
  1067. LEAQ 16(R14)(R11*1), R14
  1068. LEAQ 16(BX)(R11*1), BX
  1069. MOVUPS -16(R14), X0
  1070. MOVUPS X0, -16(BX)
  1071. JMP copy_4_end
  1072. copy_4_small:
  // Lengths under 16. There is no separate path for lengths below 3 here; such
  // a length would take the 4..7 path.
  // NOTE(review): assumes match length >= 3 - confirm against the decoder.
  1073. CMPQ R13, $0x03
  1074. JE copy_4_move_3
  1075. CMPQ R13, $0x08
  1076. JB copy_4_move_4through7
  1077. JMP copy_4_move_8through16
  1078. copy_4_move_3:
  1079. MOVW (R14), R11
  1080. MOVB 2(R14), R12
  1081. MOVW R11, (BX)
  1082. MOVB R12, 2(BX)
  1083. ADDQ R13, R14
  1084. ADDQ R13, BX
  1085. JMP copy_4_end
  1086. copy_4_move_4through7:
  // First 4 and last 4 bytes; the two moves overlap for lengths under 8.
  1087. MOVL (R14), R11
  1088. MOVL -4(R14)(R13*1), R12
  1089. MOVL R11, (BX)
  1090. MOVL R12, -4(BX)(R13*1)
  1091. ADDQ R13, R14
  1092. ADDQ R13, BX
  1093. JMP copy_4_end
  1094. copy_4_move_8through16:
  // First 8 and last 8 bytes.
  1095. MOVQ (R14), R11
  1096. MOVQ -8(R14)(R13*1), R12
  1097. MOVQ R11, (BX)
  1098. MOVQ R12, -8(BX)(R13*1)
  1099. ADDQ R13, R14
  1100. ADDQ R13, BX
  1101. copy_4_end:
  1102. ADDQ R13, DI
  1103. ADDQ $0x18, AX // next 24-byte record
  1104. INCQ DX
  1105. CMPQ DX, CX
  1106. JB main_loop
  1107. JMP loop_finished
  1108. copy_all_from_history:
  // Copy the R11 bytes that lie in the history, then fall through to
  // copy_match for the remaining R13 - R11 bytes.
  1109. MOVQ R11, R15
  1110. SUBQ $0x10, R15
  1111. JB copy_5_small
  1112. copy_5_loop:
  1113. MOVUPS (R14), X0
  1114. MOVUPS X0, (BX)
  1115. ADDQ $0x10, R14
  1116. ADDQ $0x10, BX
  1117. SUBQ $0x10, R15
  1118. JAE copy_5_loop
  1119. LEAQ 16(R14)(R15*1), R14
  1120. LEAQ 16(BX)(R15*1), BX
  1121. MOVUPS -16(R14), X0
  1122. MOVUPS X0, -16(BX)
  1123. JMP copy_5_end
  1124. copy_5_small:
  1125. CMPQ R11, $0x03
  1126. JE copy_5_move_3
  1127. JB copy_5_move_1or2
  1128. CMPQ R11, $0x08
  1129. JB copy_5_move_4through7
  1130. JMP copy_5_move_8through16
  1131. copy_5_move_1or2:
  // First and last byte (the same byte when the length is 1).
  1132. MOVB (R14), R15
  1133. MOVB -1(R14)(R11*1), BP
  1134. MOVB R15, (BX)
  1135. MOVB BP, -1(BX)(R11*1)
  1136. ADDQ R11, R14
  1137. ADDQ R11, BX
  1138. JMP copy_5_end
  1139. copy_5_move_3:
  1140. MOVW (R14), R15
  1141. MOVB 2(R14), BP
  1142. MOVW R15, (BX)
  1143. MOVB BP, 2(BX)
  1144. ADDQ R11, R14
  1145. ADDQ R11, BX
  1146. JMP copy_5_end
  1147. copy_5_move_4through7:
  1148. MOVL (R14), R15
  1149. MOVL -4(R14)(R11*1), BP
  1150. MOVL R15, (BX)
  1151. MOVL BP, -4(BX)(R11*1)
  1152. ADDQ R11, R14
  1153. ADDQ R11, BX
  1154. JMP copy_5_end
  1155. copy_5_move_8through16:
  1156. MOVQ (R14), R15
  1157. MOVQ -8(R14)(R11*1), BP
  1158. MOVQ R15, (BX)
  1159. MOVQ BP, -8(BX)(R11*1)
  1160. ADDQ R11, R14
  1161. ADDQ R11, BX
  1162. copy_5_end:
  1163. ADDQ R11, DI
  1164. SUBQ R11, R13 // match bytes still to copy from the output
  1165. // Copy match from the current buffer
  1166. copy_match:
  1167. MOVQ BX, R11
  1168. SUBQ R12, R11 // R11 = source = write pointer - offset
  1169. // ml <= mo
  1170. CMPQ R13, R12
  1171. JA copy_overlapping_match
  1172. // Copy non-overlapping match
  1173. ADDQ R13, DI
  1174. MOVQ BX, R12
  1175. ADDQ R13, BX
  1176. copy_2:
  // 16 bytes per step through R12; BX already points past the match. The
  // last step can write up to 15 bytes beyond the match length.
  1177. MOVUPS (R11), X0
  1178. MOVUPS X0, (R12)
  1179. ADDQ $0x10, R11
  1180. ADDQ $0x10, R12
  1181. SUBQ $0x10, R13
  1182. JHI copy_2
  1183. JMP handle_loop
  1184. // Copy overlapping match
  1185. copy_overlapping_match:
  1186. ADDQ R13, DI
  1187. copy_slow_3:
  // One byte at a time, so bytes written earlier in this loop can be read again.
  1188. MOVB (R11), R12
  1189. MOVB R12, (BX)
  1190. INCQ R11
  1191. INCQ BX
  1192. DECQ R13
  1193. JNZ copy_slow_3
  1194. handle_loop:
  1195. ADDQ $0x18, AX // next 24-byte record
  1196. INCQ DX
  1197. CMPQ DX, CX
  1198. JB main_loop
  1199. loop_finished:
  1200. // Return value
  1201. MOVB $0x01, ret+8(FP)
  1202. // Update the context
  1203. MOVQ ctx+0(FP), AX
  1204. MOVQ DX, 24(AX)
  1205. MOVQ DI, 104(AX)
  1206. SUBQ 80(AX), SI // literal bytes consumed = read pointer - start pointer
  1207. MOVQ SI, 112(AX)
  1208. RET
  1209. error_match_off_too_big:
  1210. // Return value
  1211. MOVB $0x00, ret+8(FP)
  1212. // Update the context
  1213. MOVQ ctx+0(FP), AX
  1214. MOVQ DX, 24(AX)
  1215. MOVQ DI, 104(AX)
  1216. SUBQ 80(AX), SI
  1217. MOVQ SI, 112(AX)
  1218. RET
  1219. empty_seqs:
  1220. // Return value
  1221. MOVB $0x01, ret+8(FP)
  1222. RET
  1223. // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
  1224. // Requires: SSE
  //
  // Executes decoded sequences: for every 24-byte record (literal length at +0,
  // match length at +8, match offset at +16) it copies the literals to the
  // output and then copies the match, taking the match bytes from the history
  // buffer, from the output written so far, or from both.
  //
  // Every copy in this function is length-exact. Lengths of 16 or more use a
  // 16-byte loop followed by one 16-byte move that ends at the last byte;
  // shorter lengths use a pair of moves sized for the length.
  //
  // Result (ret+8): 1 when all records were processed or there were none,
  // 0 when a match offset is greater than output position + history length or
  // greater than the window size. Except on the empty-input exit, the record
  // index (ctx+24), output position (ctx+104) and literal bytes consumed
  // (ctx+112) are written back before returning.
  //
  // Register roles:
  //   AX  current record          CX  number of records (ctx+8)
  //   DX  record index (ctx+24)   BX  output write pointer (ctx+32 + ctx+104)
  //   SI  literal read pointer (ctx+80)
  //   DI  output position (ctx+104)
  //   R8  window size (ctx+120)
  //   R9  one past the end of the history (ctx+56 + ctx+64)
  //   R10 history length (ctx+64)
  //   R11 literal length, later scratch   R12 match offset   R13 match length
  //   R14, R15, BP, X0  scratch
  1225. TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
  1226. MOVQ ctx+0(FP), R10
  1227. MOVQ 8(R10), CX
  1228. TESTQ CX, CX
  1229. JZ empty_seqs
  1230. MOVQ (R10), AX
  1231. MOVQ 24(R10), DX
  1232. MOVQ 32(R10), BX
  1233. MOVQ 80(R10), SI
  1234. MOVQ 104(R10), DI
  1235. MOVQ 120(R10), R8
  1236. MOVQ 56(R10), R9
  1237. MOVQ 64(R10), R10 // R10 now holds the history length, no longer ctx
  1238. ADDQ R10, R9
  1239. // seqsBase += 24 * seqIndex
  1240. LEAQ (DX)(DX*2), R11
  1241. SHLQ $0x03, R11
  1242. ADDQ R11, AX
  1243. // outBase += outPosition
  1244. ADDQ DI, BX
  1245. main_loop:
  1246. MOVQ (AX), R11 // literal length
  1247. MOVQ 16(AX), R12 // match offset
  1248. MOVQ 8(AX), R13 // match length
  1249. // Copy literals
  1250. TESTQ R11, R11
  1251. JZ check_offset
  1252. MOVQ R11, R14
  1253. SUBQ $0x10, R14
  1254. JB copy_1_small // fewer than 16 literal bytes
  1255. copy_1_loop:
  1256. MOVUPS (SI), X0
  1257. MOVUPS X0, (BX)
  1258. ADDQ $0x10, SI
  1259. ADDQ $0x10, BX
  1260. SUBQ $0x10, R14
  1261. JAE copy_1_loop
  // R14 is now negative: place both pointers at the exact end and copy the
  // final 16 bytes ending there (may overlap the previous unit).
  1262. LEAQ 16(SI)(R14*1), SI
  1263. LEAQ 16(BX)(R14*1), BX
  1264. MOVUPS -16(SI), X0
  1265. MOVUPS X0, -16(BX)
  1266. JMP copy_1_end
  1267. copy_1_small:
  1268. CMPQ R11, $0x03
  1269. JE copy_1_move_3
  1270. JB copy_1_move_1or2
  1271. CMPQ R11, $0x08
  1272. JB copy_1_move_4through7
  1273. JMP copy_1_move_8through16
  1274. copy_1_move_1or2:
  // First and last byte (the same byte when the length is 1).
  1275. MOVB (SI), R14
  1276. MOVB -1(SI)(R11*1), R15
  1277. MOVB R14, (BX)
  1278. MOVB R15, -1(BX)(R11*1)
  1279. ADDQ R11, SI
  1280. ADDQ R11, BX
  1281. JMP copy_1_end
  1282. copy_1_move_3:
  1283. MOVW (SI), R14
  1284. MOVB 2(SI), R15
  1285. MOVW R14, (BX)
  1286. MOVB R15, 2(BX)
  1287. ADDQ R11, SI
  1288. ADDQ R11, BX
  1289. JMP copy_1_end
  1290. copy_1_move_4through7:
  // First 4 and last 4 bytes; the two moves overlap for lengths under 8.
  1291. MOVL (SI), R14
  1292. MOVL -4(SI)(R11*1), R15
  1293. MOVL R14, (BX)
  1294. MOVL R15, -4(BX)(R11*1)
  1295. ADDQ R11, SI
  1296. ADDQ R11, BX
  1297. JMP copy_1_end
  1298. copy_1_move_8through16:
  // First 8 and last 8 bytes.
  1299. MOVQ (SI), R14
  1300. MOVQ -8(SI)(R11*1), R15
  1301. MOVQ R14, (BX)
  1302. MOVQ R15, -8(BX)(R11*1)
  1303. ADDQ R11, SI
  1304. ADDQ R11, BX
  1305. copy_1_end:
  1306. ADDQ R11, DI
  1307. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  1308. check_offset:
  1309. LEAQ (DI)(R10*1), R11
  1310. CMPQ R12, R11
  1311. JG error_match_off_too_big
  1312. CMPQ R12, R8
  1313. JG error_match_off_too_big
  1314. // Copy match from history
  1315. MOVQ R12, R11
  1316. SUBQ DI, R11 // R11 = offset - output position = bytes that lie in the history
  1317. JLS copy_match // offset <= output position: match is entirely in the output
  1318. MOVQ R9, R14
  1319. SUBQ R11, R14 // R14 = source address inside the history
  1320. CMPQ R13, R11
  1321. JG copy_all_from_history // match runs past the end of the history
  // The whole match lies in the history: copy exactly R13 bytes.
  1322. MOVQ R13, R11
  1323. SUBQ $0x10, R11
  1324. JB copy_4_small
  1325. copy_4_loop:
  1326. MOVUPS (R14), X0
  1327. MOVUPS X0, (BX)
  1328. ADDQ $0x10, R14
  1329. ADDQ $0x10, BX
  1330. SUBQ $0x10, R11
  1331. JAE copy_4_loop
  1332. LEAQ 16(R14)(R11*1), R14
  1333. LEAQ 16(BX)(R11*1), BX
  1334. MOVUPS -16(R14), X0
  1335. MOVUPS X0, -16(BX)
  1336. JMP copy_4_end
  1337. copy_4_small:
  // Lengths under 16. There is no separate path for lengths below 3 here; such
  // a length would take the 4..7 path.
  // NOTE(review): assumes match length >= 3 - confirm against the decoder.
  1338. CMPQ R13, $0x03
  1339. JE copy_4_move_3
  1340. CMPQ R13, $0x08
  1341. JB copy_4_move_4through7
  1342. JMP copy_4_move_8through16
  1343. copy_4_move_3:
  1344. MOVW (R14), R11
  1345. MOVB 2(R14), R12
  1346. MOVW R11, (BX)
  1347. MOVB R12, 2(BX)
  1348. ADDQ R13, R14
  1349. ADDQ R13, BX
  1350. JMP copy_4_end
  1351. copy_4_move_4through7:
  1352. MOVL (R14), R11
  1353. MOVL -4(R14)(R13*1), R12
  1354. MOVL R11, (BX)
  1355. MOVL R12, -4(BX)(R13*1)
  1356. ADDQ R13, R14
  1357. ADDQ R13, BX
  1358. JMP copy_4_end
  1359. copy_4_move_8through16:
  1360. MOVQ (R14), R11
  1361. MOVQ -8(R14)(R13*1), R12
  1362. MOVQ R11, (BX)
  1363. MOVQ R12, -8(BX)(R13*1)
  1364. ADDQ R13, R14
  1365. ADDQ R13, BX
  1366. copy_4_end:
  1367. ADDQ R13, DI
  1368. ADDQ $0x18, AX // next 24-byte record
  1369. INCQ DX
  1370. CMPQ DX, CX
  1371. JB main_loop
  1372. JMP loop_finished
  1373. copy_all_from_history:
  // Copy the R11 bytes that lie in the history, then fall through to
  // copy_match for the remaining R13 - R11 bytes.
  1374. MOVQ R11, R15
  1375. SUBQ $0x10, R15
  1376. JB copy_5_small
  1377. copy_5_loop:
  1378. MOVUPS (R14), X0
  1379. MOVUPS X0, (BX)
  1380. ADDQ $0x10, R14
  1381. ADDQ $0x10, BX
  1382. SUBQ $0x10, R15
  1383. JAE copy_5_loop
  1384. LEAQ 16(R14)(R15*1), R14
  1385. LEAQ 16(BX)(R15*1), BX
  1386. MOVUPS -16(R14), X0
  1387. MOVUPS X0, -16(BX)
  1388. JMP copy_5_end
  1389. copy_5_small:
  1390. CMPQ R11, $0x03
  1391. JE copy_5_move_3
  1392. JB copy_5_move_1or2
  1393. CMPQ R11, $0x08
  1394. JB copy_5_move_4through7
  1395. JMP copy_5_move_8through16
  1396. copy_5_move_1or2:
  1397. MOVB (R14), R15
  1398. MOVB -1(R14)(R11*1), BP
  1399. MOVB R15, (BX)
  1400. MOVB BP, -1(BX)(R11*1)
  1401. ADDQ R11, R14
  1402. ADDQ R11, BX
  1403. JMP copy_5_end
  1404. copy_5_move_3:
  1405. MOVW (R14), R15
  1406. MOVB 2(R14), BP
  1407. MOVW R15, (BX)
  1408. MOVB BP, 2(BX)
  1409. ADDQ R11, R14
  1410. ADDQ R11, BX
  1411. JMP copy_5_end
  1412. copy_5_move_4through7:
  1413. MOVL (R14), R15
  1414. MOVL -4(R14)(R11*1), BP
  1415. MOVL R15, (BX)
  1416. MOVL BP, -4(BX)(R11*1)
  1417. ADDQ R11, R14
  1418. ADDQ R11, BX
  1419. JMP copy_5_end
  1420. copy_5_move_8through16:
  1421. MOVQ (R14), R15
  1422. MOVQ -8(R14)(R11*1), BP
  1423. MOVQ R15, (BX)
  1424. MOVQ BP, -8(BX)(R11*1)
  1425. ADDQ R11, R14
  1426. ADDQ R11, BX
  1427. copy_5_end:
  1428. ADDQ R11, DI
  1429. SUBQ R11, R13 // match bytes still to copy from the output
  1430. // Copy match from the current buffer
  1431. copy_match:
  1432. MOVQ BX, R11
  1433. SUBQ R12, R11 // R11 = source = write pointer - offset
  1434. // ml <= mo
  1435. CMPQ R13, R12
  1436. JA copy_overlapping_match
  1437. // Copy non-overlapping match
  1438. ADDQ R13, DI
  1439. MOVQ R13, R12
  1440. SUBQ $0x10, R12
  1441. JB copy_2_small // fewer than 16 match bytes
  1442. copy_2_loop:
  1443. MOVUPS (R11), X0
  1444. MOVUPS X0, (BX)
  1445. ADDQ $0x10, R11
  1446. ADDQ $0x10, BX
  1447. SUBQ $0x10, R12
  1448. JAE copy_2_loop
  1449. LEAQ 16(R11)(R12*1), R11
  1450. LEAQ 16(BX)(R12*1), BX
  1451. MOVUPS -16(R11), X0
  1452. MOVUPS X0, -16(BX)
  1453. JMP copy_2_end
  1454. copy_2_small:
  1455. CMPQ R13, $0x03
  1456. JE copy_2_move_3
  1457. JB copy_2_move_1or2
  1458. CMPQ R13, $0x08
  1459. JB copy_2_move_4through7
  1460. JMP copy_2_move_8through16
  1461. copy_2_move_1or2:
  1462. MOVB (R11), R12
  1463. MOVB -1(R11)(R13*1), R14
  1464. MOVB R12, (BX)
  1465. MOVB R14, -1(BX)(R13*1)
  1466. ADDQ R13, R11
  1467. ADDQ R13, BX
  1468. JMP copy_2_end
  1469. copy_2_move_3:
  1470. MOVW (R11), R12
  1471. MOVB 2(R11), R14
  1472. MOVW R12, (BX)
  1473. MOVB R14, 2(BX)
  1474. ADDQ R13, R11
  1475. ADDQ R13, BX
  1476. JMP copy_2_end
  1477. copy_2_move_4through7:
  1478. MOVL (R11), R12
  1479. MOVL -4(R11)(R13*1), R14
  1480. MOVL R12, (BX)
  1481. MOVL R14, -4(BX)(R13*1)
  1482. ADDQ R13, R11
  1483. ADDQ R13, BX
  1484. JMP copy_2_end
  1485. copy_2_move_8through16:
  1486. MOVQ (R11), R12
  1487. MOVQ -8(R11)(R13*1), R14
  1488. MOVQ R12, (BX)
  1489. MOVQ R14, -8(BX)(R13*1)
  1490. ADDQ R13, R11
  1491. ADDQ R13, BX
  1492. copy_2_end:
  1493. JMP handle_loop
  1494. // Copy overlapping match
  1495. copy_overlapping_match:
  1496. ADDQ R13, DI
  1497. copy_slow_3:
  // One byte at a time, so bytes written earlier in this loop can be read again.
  1498. MOVB (R11), R12
  1499. MOVB R12, (BX)
  1500. INCQ R11
  1501. INCQ BX
  1502. DECQ R13
  1503. JNZ copy_slow_3
  1504. handle_loop:
  1505. ADDQ $0x18, AX // next 24-byte record
  1506. INCQ DX
  1507. CMPQ DX, CX
  1508. JB main_loop
  1509. loop_finished:
  1510. // Return value
  1511. MOVB $0x01, ret+8(FP)
  1512. // Update the context
  1513. MOVQ ctx+0(FP), AX
  1514. MOVQ DX, 24(AX)
  1515. MOVQ DI, 104(AX)
  1516. SUBQ 80(AX), SI // literal bytes consumed = read pointer - start pointer
  1517. MOVQ SI, 112(AX)
  1518. RET
  1519. error_match_off_too_big:
  1520. // Return value
  1521. MOVB $0x00, ret+8(FP)
  1522. // Update the context
  1523. MOVQ ctx+0(FP), AX
  1524. MOVQ DX, 24(AX)
  1525. MOVQ DI, 104(AX)
  1526. SUBQ 80(AX), SI
  1527. MOVQ SI, 112(AX)
  1528. RET
  1529. empty_seqs:
  1530. // Return value
  1531. MOVB $0x01, ret+8(FP)
  1532. RET
  1533. // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  1534. // Requires: CMOV, SSE
  1535. TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
  1536. MOVQ br+8(FP), AX
  1537. MOVQ 32(AX), DX
  1538. MOVBQZX 40(AX), BX
  1539. MOVQ 24(AX), SI
  1540. MOVQ (AX), AX
  1541. ADDQ SI, AX
  1542. MOVQ AX, (SP)
  1543. MOVQ ctx+16(FP), AX
  1544. MOVQ 72(AX), DI
  1545. MOVQ 80(AX), R8
  1546. MOVQ 88(AX), R9
  1547. XORQ CX, CX
  1548. MOVQ CX, 8(SP)
  1549. MOVQ CX, 16(SP)
  1550. MOVQ CX, 24(SP)
  1551. MOVQ 112(AX), R10
  1552. MOVQ 128(AX), CX
  1553. MOVQ CX, 32(SP)
  1554. MOVQ 144(AX), R11
  1555. MOVQ 136(AX), R12
  1556. MOVQ 200(AX), CX
  1557. MOVQ CX, 56(SP)
  1558. MOVQ 176(AX), CX
  1559. MOVQ CX, 48(SP)
  1560. MOVQ 184(AX), AX
  1561. MOVQ AX, 40(SP)
  1562. MOVQ 40(SP), AX
  1563. ADDQ AX, 48(SP)
  1564. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  1565. ADDQ R10, 32(SP)
  1566. // outBase += outPosition
  1567. ADDQ R12, R10
  1568. sequenceDecs_decodeSync_amd64_main_loop:
  1569. MOVQ (SP), R13
  1570. // Fill bitreader to have enough for the offset and match length.
  1571. CMPQ SI, $0x08
  1572. JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1573. MOVQ BX, AX
  1574. SHRQ $0x03, AX
  1575. SUBQ AX, R13
  1576. MOVQ (R13), DX
  1577. SUBQ AX, SI
  1578. ANDQ $0x07, BX
  1579. JMP sequenceDecs_decodeSync_amd64_fill_end
  1580. sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
  1581. CMPQ SI, $0x00
  1582. JLE sequenceDecs_decodeSync_amd64_fill_end
  1583. CMPQ BX, $0x07
  1584. JLE sequenceDecs_decodeSync_amd64_fill_end
  1585. SHLQ $0x08, DX
  1586. SUBQ $0x01, R13
  1587. SUBQ $0x01, SI
  1588. SUBQ $0x08, BX
  1589. MOVBQZX (R13), AX
  1590. ORQ AX, DX
  1591. JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1592. sequenceDecs_decodeSync_amd64_fill_end:
  1593. // Update offset
  1594. MOVQ R9, AX
  1595. MOVQ BX, CX
  1596. MOVQ DX, R14
  1597. SHLQ CL, R14
  1598. MOVB AH, CL
  1599. SHRQ $0x20, AX
  1600. TESTQ CX, CX
  1601. JZ sequenceDecs_decodeSync_amd64_of_update_zero
  1602. ADDQ CX, BX
  1603. CMPQ BX, $0x40
  1604. JA sequenceDecs_decodeSync_amd64_of_update_zero
  1605. CMPQ CX, $0x40
  1606. JAE sequenceDecs_decodeSync_amd64_of_update_zero
  1607. NEGQ CX
  1608. SHRQ CL, R14
  1609. ADDQ R14, AX
  1610. sequenceDecs_decodeSync_amd64_of_update_zero:
  1611. MOVQ AX, 8(SP)
  1612. // Update match length
  1613. MOVQ R8, AX
  1614. MOVQ BX, CX
  1615. MOVQ DX, R14
  1616. SHLQ CL, R14
  1617. MOVB AH, CL
  1618. SHRQ $0x20, AX
  1619. TESTQ CX, CX
  1620. JZ sequenceDecs_decodeSync_amd64_ml_update_zero
  1621. ADDQ CX, BX
  1622. CMPQ BX, $0x40
  1623. JA sequenceDecs_decodeSync_amd64_ml_update_zero
  1624. CMPQ CX, $0x40
  1625. JAE sequenceDecs_decodeSync_amd64_ml_update_zero
  1626. NEGQ CX
  1627. SHRQ CL, R14
  1628. ADDQ R14, AX
  1629. sequenceDecs_decodeSync_amd64_ml_update_zero:
  1630. MOVQ AX, 16(SP)
  1631. // Fill bitreader to have enough for the remaining
  1632. CMPQ SI, $0x08
  1633. JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1634. MOVQ BX, AX
  1635. SHRQ $0x03, AX
  1636. SUBQ AX, R13
  1637. MOVQ (R13), DX
  1638. SUBQ AX, SI
  1639. ANDQ $0x07, BX
  1640. JMP sequenceDecs_decodeSync_amd64_fill_2_end
  1641. sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
  1642. CMPQ SI, $0x00
  1643. JLE sequenceDecs_decodeSync_amd64_fill_2_end
  1644. CMPQ BX, $0x07
  1645. JLE sequenceDecs_decodeSync_amd64_fill_2_end
  1646. SHLQ $0x08, DX
  1647. SUBQ $0x01, R13
  1648. SUBQ $0x01, SI
  1649. SUBQ $0x08, BX
  1650. MOVBQZX (R13), AX
  1651. ORQ AX, DX
  1652. JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1653. sequenceDecs_decodeSync_amd64_fill_2_end:
  1654. // Update literal length
  1655. MOVQ DI, AX
  1656. MOVQ BX, CX
  1657. MOVQ DX, R14
  1658. SHLQ CL, R14
  1659. MOVB AH, CL
  1660. SHRQ $0x20, AX
  1661. TESTQ CX, CX
  1662. JZ sequenceDecs_decodeSync_amd64_ll_update_zero
  1663. ADDQ CX, BX
  1664. CMPQ BX, $0x40
  1665. JA sequenceDecs_decodeSync_amd64_ll_update_zero
  1666. CMPQ CX, $0x40
  1667. JAE sequenceDecs_decodeSync_amd64_ll_update_zero
  1668. NEGQ CX
  1669. SHRQ CL, R14
  1670. ADDQ R14, AX
  1671. sequenceDecs_decodeSync_amd64_ll_update_zero:
  1672. MOVQ AX, 24(SP)
  1673. // Fill bitreader for state updates
  1674. MOVQ R13, (SP)
  1675. MOVQ R9, AX
  1676. SHRQ $0x08, AX
  1677. MOVBQZX AL, AX
  1678. MOVQ ctx+16(FP), CX
  1679. CMPQ 96(CX), $0x00
  1680. JZ sequenceDecs_decodeSync_amd64_skip_update
  1681. // Update Literal Length State
  1682. MOVBQZX DI, R13
  1683. SHRQ $0x10, DI
  1684. MOVWQZX DI, DI
  1685. LEAQ (BX)(R13*1), CX
  1686. MOVQ DX, R14
  1687. MOVQ CX, BX
  1688. ROLQ CL, R14
  1689. MOVL $0x00000001, R15
  1690. MOVB R13, CL
  1691. SHLL CL, R15
  1692. DECL R15
  1693. ANDQ R15, R14
  1694. ADDQ R14, DI
  1695. // Load ctx.llTable
  1696. MOVQ ctx+16(FP), CX
  1697. MOVQ (CX), CX
  1698. MOVQ (CX)(DI*8), DI
  1699. // Update Match Length State
  1700. MOVBQZX R8, R13
  1701. SHRQ $0x10, R8
  1702. MOVWQZX R8, R8
  1703. LEAQ (BX)(R13*1), CX
  1704. MOVQ DX, R14
  1705. MOVQ CX, BX
  1706. ROLQ CL, R14
  1707. MOVL $0x00000001, R15
  1708. MOVB R13, CL
  1709. SHLL CL, R15
  1710. DECL R15
  1711. ANDQ R15, R14
  1712. ADDQ R14, R8
  1713. // Load ctx.mlTable
  1714. MOVQ ctx+16(FP), CX
  1715. MOVQ 24(CX), CX
  1716. MOVQ (CX)(R8*8), R8
  1717. // Update Offset State
  1718. MOVBQZX R9, R13
  1719. SHRQ $0x10, R9
  1720. MOVWQZX R9, R9
  1721. LEAQ (BX)(R13*1), CX
  1722. MOVQ DX, R14
  1723. MOVQ CX, BX
  1724. ROLQ CL, R14
  1725. MOVL $0x00000001, R15
  1726. MOVB R13, CL
  1727. SHLL CL, R15
  1728. DECL R15
  1729. ANDQ R15, R14
  1730. ADDQ R14, R9
  1731. // Load ctx.ofTable
  1732. MOVQ ctx+16(FP), CX
  1733. MOVQ 48(CX), CX
  1734. MOVQ (CX)(R9*8), R9
  1735. sequenceDecs_decodeSync_amd64_skip_update:
  1736. // Adjust offset
  1737. MOVQ s+0(FP), CX
  1738. MOVQ 8(SP), R13
  1739. CMPQ AX, $0x01
  1740. JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
  1741. MOVUPS 144(CX), X0
  1742. MOVQ R13, 144(CX)
  1743. MOVUPS X0, 152(CX)
  1744. JMP sequenceDecs_decodeSync_amd64_after_adjust
  1745. sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
  1746. CMPQ 24(SP), $0x00000000
  1747. JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
  1748. INCQ R13
  1749. JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  1750. sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
  1751. TESTQ R13, R13
  1752. JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  1753. MOVQ 144(CX), R13
  1754. JMP sequenceDecs_decodeSync_amd64_after_adjust
  1755. sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
  1756. MOVQ R13, AX
  1757. XORQ R14, R14
  1758. MOVQ $-1, R15
  1759. CMPQ R13, $0x03
  1760. CMOVQEQ R14, AX
  1761. CMOVQEQ R15, R14
  1762. ADDQ 144(CX)(AX*8), R14
  1763. JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
  1764. MOVQ $0x00000001, R14
  1765. sequenceDecs_decodeSync_amd64_adjust_temp_valid:
  1766. CMPQ R13, $0x01
  1767. JZ sequenceDecs_decodeSync_amd64_adjust_skip
  1768. MOVQ 152(CX), AX
  1769. MOVQ AX, 160(CX)
  1770. sequenceDecs_decodeSync_amd64_adjust_skip:
  1771. MOVQ 144(CX), AX
  1772. MOVQ AX, 152(CX)
  1773. MOVQ R14, 144(CX)
  1774. MOVQ R14, R13
  1775. sequenceDecs_decodeSync_amd64_after_adjust:
  1776. MOVQ R13, 8(SP)
  1777. // Check values
  1778. MOVQ 16(SP), AX
  1779. MOVQ 24(SP), CX
  1780. LEAQ (AX)(CX*1), R14
  1781. MOVQ s+0(FP), R15
  1782. ADDQ R14, 256(R15)
  1783. MOVQ ctx+16(FP), R14
  1784. SUBQ CX, 104(R14)
  1785. JS error_not_enough_literals
  1786. CMPQ AX, $0x00020002
  1787. JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
  1788. TESTQ R13, R13
  1789. JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
  1790. TESTQ AX, AX
  1791. JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
  1792. sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
  1793. MOVQ 24(SP), AX
  1794. MOVQ 8(SP), CX
  1795. MOVQ 16(SP), R13
  1796. // Check if we have enough space in s.out
  1797. LEAQ (AX)(R13*1), R14
  1798. ADDQ R10, R14
  1799. CMPQ R14, 32(SP)
  1800. JA error_not_enough_space
  1801. // Copy literals
  1802. TESTQ AX, AX
  1803. JZ check_offset
  1804. XORQ R14, R14
  1805. copy_1:
  1806. MOVUPS (R11)(R14*1), X0
  1807. MOVUPS X0, (R10)(R14*1)
  1808. ADDQ $0x10, R14
  1809. CMPQ R14, AX
  1810. JB copy_1
  1811. ADDQ AX, R11
  1812. ADDQ AX, R10
  1813. ADDQ AX, R12
  1814. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1815. check_offset:
  1816. MOVQ R12, AX
  1817. ADDQ 40(SP), AX
  1818. CMPQ CX, AX
  1819. JG error_match_off_too_big
  1820. CMPQ CX, 56(SP)
  1821. JG error_match_off_too_big
  1822. // Copy match from history
  1823. MOVQ CX, AX
  1824. SUBQ R12, AX
  1825. JLS copy_match
  1826. MOVQ 48(SP), R14
  1827. SUBQ AX, R14
  1828. CMPQ R13, AX
  1829. JG copy_all_from_history
  1830. MOVQ R13, AX
  1831. SUBQ $0x10, AX
  1832. JB copy_4_small
  1833. copy_4_loop:
  1834. MOVUPS (R14), X0
  1835. MOVUPS X0, (R10)
  1836. ADDQ $0x10, R14
  1837. ADDQ $0x10, R10
  1838. SUBQ $0x10, AX
  1839. JAE copy_4_loop
  1840. LEAQ 16(R14)(AX*1), R14
  1841. LEAQ 16(R10)(AX*1), R10
  1842. MOVUPS -16(R14), X0
  1843. MOVUPS X0, -16(R10)
  1844. JMP copy_4_end
  1845. copy_4_small:
  1846. CMPQ R13, $0x03
  1847. JE copy_4_move_3
  1848. CMPQ R13, $0x08
  1849. JB copy_4_move_4through7
  1850. JMP copy_4_move_8through16
  1851. copy_4_move_3:
  1852. MOVW (R14), AX
  1853. MOVB 2(R14), CL
  1854. MOVW AX, (R10)
  1855. MOVB CL, 2(R10)
  1856. ADDQ R13, R14
  1857. ADDQ R13, R10
  1858. JMP copy_4_end
  1859. copy_4_move_4through7:
  1860. MOVL (R14), AX
  1861. MOVL -4(R14)(R13*1), CX
  1862. MOVL AX, (R10)
  1863. MOVL CX, -4(R10)(R13*1)
  1864. ADDQ R13, R14
  1865. ADDQ R13, R10
  1866. JMP copy_4_end
  1867. copy_4_move_8through16:
  1868. MOVQ (R14), AX
  1869. MOVQ -8(R14)(R13*1), CX
  1870. MOVQ AX, (R10)
  1871. MOVQ CX, -8(R10)(R13*1)
  1872. ADDQ R13, R14
  1873. ADDQ R13, R10
  1874. copy_4_end:
  1875. ADDQ R13, R12
  1876. JMP handle_loop
  1877. JMP loop_finished
  1878. copy_all_from_history:
  1879. MOVQ AX, R15
  1880. SUBQ $0x10, R15
  1881. JB copy_5_small
  1882. copy_5_loop:
  1883. MOVUPS (R14), X0
  1884. MOVUPS X0, (R10)
  1885. ADDQ $0x10, R14
  1886. ADDQ $0x10, R10
  1887. SUBQ $0x10, R15
  1888. JAE copy_5_loop
  1889. LEAQ 16(R14)(R15*1), R14
  1890. LEAQ 16(R10)(R15*1), R10
  1891. MOVUPS -16(R14), X0
  1892. MOVUPS X0, -16(R10)
  1893. JMP copy_5_end
  1894. copy_5_small:
  1895. CMPQ AX, $0x03
  1896. JE copy_5_move_3
  1897. JB copy_5_move_1or2
  1898. CMPQ AX, $0x08
  1899. JB copy_5_move_4through7
  1900. JMP copy_5_move_8through16
  1901. copy_5_move_1or2:
  1902. MOVB (R14), R15
  1903. MOVB -1(R14)(AX*1), BP
  1904. MOVB R15, (R10)
  1905. MOVB BP, -1(R10)(AX*1)
  1906. ADDQ AX, R14
  1907. ADDQ AX, R10
  1908. JMP copy_5_end
  1909. copy_5_move_3:
  1910. MOVW (R14), R15
  1911. MOVB 2(R14), BP
  1912. MOVW R15, (R10)
  1913. MOVB BP, 2(R10)
  1914. ADDQ AX, R14
  1915. ADDQ AX, R10
  1916. JMP copy_5_end
  1917. copy_5_move_4through7:
  1918. MOVL (R14), R15
  1919. MOVL -4(R14)(AX*1), BP
  1920. MOVL R15, (R10)
  1921. MOVL BP, -4(R10)(AX*1)
  1922. ADDQ AX, R14
  1923. ADDQ AX, R10
  1924. JMP copy_5_end
  1925. copy_5_move_8through16:
  1926. MOVQ (R14), R15
  1927. MOVQ -8(R14)(AX*1), BP
  1928. MOVQ R15, (R10)
  1929. MOVQ BP, -8(R10)(AX*1)
  1930. ADDQ AX, R14
  1931. ADDQ AX, R10
  1932. copy_5_end:
  1933. ADDQ AX, R12
  1934. SUBQ AX, R13
  1935. // Copy match from the current buffer
  1936. copy_match:
  1937. MOVQ R10, AX
  1938. SUBQ CX, AX
  1939. // ml <= mo
  1940. CMPQ R13, CX
  1941. JA copy_overlapping_match
  1942. // Copy non-overlapping match
  1943. ADDQ R13, R12
  1944. MOVQ R10, CX
  1945. ADDQ R13, R10
  1946. copy_2:
  1947. MOVUPS (AX), X0
  1948. MOVUPS X0, (CX)
  1949. ADDQ $0x10, AX
  1950. ADDQ $0x10, CX
  1951. SUBQ $0x10, R13
  1952. JHI copy_2
  1953. JMP handle_loop
  1954. // Copy overlapping match
  1955. copy_overlapping_match:
  1956. ADDQ R13, R12
  1957. copy_slow_3:
  1958. MOVB (AX), CL
  1959. MOVB CL, (R10)
  1960. INCQ AX
  1961. INCQ R10
  1962. DECQ R13
  1963. JNZ copy_slow_3
  1964. handle_loop:
  1965. MOVQ ctx+16(FP), AX
  1966. DECQ 96(AX)
  1967. JNS sequenceDecs_decodeSync_amd64_main_loop
  1968. loop_finished:
  1969. MOVQ br+8(FP), AX
  1970. MOVQ DX, 32(AX)
  1971. MOVB BL, 40(AX)
  1972. MOVQ SI, 24(AX)
  1973. // Update the context
  1974. MOVQ ctx+16(FP), AX
  1975. MOVQ R12, 136(AX)
  1976. MOVQ 144(AX), CX
  1977. SUBQ CX, R11
  1978. MOVQ R11, 168(AX)
  1979. // Return success
  1980. MOVQ $0x00000000, ret+24(FP)
  1981. RET
  1982. // Return with match length error
  1983. sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
  1984. MOVQ 16(SP), AX
  1985. MOVQ ctx+16(FP), CX
  1986. MOVQ AX, 216(CX)
  1987. MOVQ $0x00000001, ret+24(FP)
  1988. RET
  1989. // Return with match too long error
  1990. sequenceDecs_decodeSync_amd64_error_match_len_too_big:
  1991. MOVQ ctx+16(FP), AX
  1992. MOVQ 16(SP), CX
  1993. MOVQ CX, 216(AX)
  1994. MOVQ $0x00000002, ret+24(FP)
  1995. RET
  1996. // Return with match offset too long error
  1997. error_match_off_too_big:
  1998. MOVQ ctx+16(FP), AX
  1999. MOVQ 8(SP), CX
  2000. MOVQ CX, 224(AX)
  2001. MOVQ R12, 136(AX)
  2002. MOVQ $0x00000003, ret+24(FP)
  2003. RET
  2004. // Return with not enough literals error
  2005. error_not_enough_literals:
  2006. MOVQ ctx+16(FP), AX
  2007. MOVQ 24(SP), CX
  2008. MOVQ CX, 208(AX)
  2009. MOVQ $0x00000004, ret+24(FP)
  2010. RET
  2011. // Return with not enough output space error
  2012. error_not_enough_space:
  2013. MOVQ ctx+16(FP), AX
  2014. MOVQ 24(SP), CX
  2015. MOVQ CX, 208(AX)
  2016. MOVQ 16(SP), CX
  2017. MOVQ CX, 216(AX)
  2018. MOVQ R12, 136(AX)
  2019. MOVQ $0x00000005, ret+24(FP)
  2020. RET
  2021. // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2022. // Requires: BMI, BMI2, CMOV, SSE
        //
        // Decode-and-execute loop. For every sequence it reads the offset, match
        // length and literal length from the bit stream, advances the three state
        // words, resolves the offset against the three-entry offset history in s,
        // and then copies the literals and the match straight into the output.
        // The loop repeats until the counter at 96(ctx) goes negative.
        //
        // Return value (ret+24(FP)):
        //   0  success
        //   1  match length is non-zero but the offset is zero
        //   2  match length is above 0x20002
        //   3  match offset is too big (beyond output+history or the window size)
        //   4  not enough literals left
        //   5  not enough space left in the output buffer
        // Only the success path writes the bit reader back to br.
        //
        // Register roles inside the main loop:
        //   AX   64-bit bit buffer (loaded from / stored to 32(br))
        //   DX   number of bits of AX already consumed (byte at 40(br))
        //   BX   input bytes not yet loaded (24(br))
        //   SI   literal length state word
        //   DI   match length state word
        //   R8   offset state word
        //   R9   write pointer into the output buffer
        //   R10  read pointer into the literals
        //   R11  output position (outPosition, 136(ctx))
        //   CX, R12-R15 (and BP in copy_5_*) are scratch
        //
        // Stack slots:
        //    0(SP)  input read pointer (moves downwards on refill)
        //    8(SP)  offset of the current sequence (mo)
        //   16(SP)  match length of the current sequence (ml)
        //   24(SP)  literal length of the current sequence (ll)
        //   32(SP)  pointer one past the end of the output buffer
        //   40(SP)  length of the history
        //   48(SP)  pointer one past the end of the history
        //   56(SP)  window size
        //
        // State word layout, as decoded by the BEXTR/SHR sequences below:
        //   bits  0-7   number of bits to read for the next state
        //   bits  8-15  number of extra bits to read for the value
        //   bits 16-31  base index of the next state
        //   bits 32-63  baseline of the value
        //
        // The literal copy (copy_1) and the in-buffer match copy (copy_2) move 16
        // bytes per step and can touch up to 15 bytes past the requested length.
        // NOTE(review): this presumably relies on the caller providing slack in the
        // literal and output buffers - confirm against the Go caller.
  2023. TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
        // Load the bit reader: AX = bit buffer, DX = bits consumed, BX = bytes left.
  2024. MOVQ br+8(FP), CX
  2025. MOVQ 32(CX), AX
  2026. MOVBQZX 40(CX), DX
  2027. MOVQ 24(CX), BX
        // (SP) = input base pointer + BX: the current read position.
  2028. MOVQ (CX), CX
  2029. ADDQ BX, CX
  2030. MOVQ CX, (SP)
        // Load the three state words from ctx.
  2031. MOVQ ctx+16(FP), CX
  2032. MOVQ 72(CX), SI
  2033. MOVQ 80(CX), DI
  2034. MOVQ 88(CX), R8
        // Zero the mo / ml / ll slots.
  2035. XORQ R9, R9
  2036. MOVQ R9, 8(SP)
  2037. MOVQ R9, 16(SP)
  2038. MOVQ R9, 24(SP)
        // R9 = output base; 32(SP) = output capacity (made a pointer further down).
  2039. MOVQ 112(CX), R9
  2040. MOVQ 128(CX), R10
  2041. MOVQ R10, 32(SP)
        // R10 = literals read pointer, R11 = output position.
  2042. MOVQ 144(CX), R10
  2043. MOVQ 136(CX), R11
        // 56(SP) = window size.
  2044. MOVQ 200(CX), R12
  2045. MOVQ R12, 56(SP)
        // 48(SP) = history base, 40(SP) = history length; adding them makes 48(SP)
        // point one past the last history byte.
  2046. MOVQ 176(CX), R12
  2047. MOVQ R12, 48(SP)
  2048. MOVQ 184(CX), CX
  2049. MOVQ CX, 40(SP)
        // NOTE(review): the reload below reads back the value just stored from CX;
        // it is redundant but harmless.
  2050. MOVQ 40(SP), CX
  2051. ADDQ CX, 48(SP)
  2052. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2053. ADDQ R9, 32(SP)
  2054. // outBase += outPosition
  2055. ADDQ R11, R9
  2056. sequenceDecs_decodeSync_bmi2_main_loop:
  2057. MOVQ (SP), R12
  2058. // Fill bitreader to have enough for the offset and match length.
        // Fast path (8 or more input bytes left): step the read pointer back by the
        // number of whole bytes consumed (DX/8), reload a full 8-byte window and
        // keep only the sub-byte remainder in DX.
  2059. CMPQ BX, $0x08
  2060. JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2061. MOVQ DX, CX
  2062. SHRQ $0x03, CX
  2063. SUBQ CX, R12
  2064. MOVQ (R12), AX
  2065. SUBQ CX, BX
  2066. ANDQ $0x07, DX
  2067. JMP sequenceDecs_decodeSync_bmi2_fill_end
        // Slow path: while input remains (BX > 0) and at least 8 bits have been
        // consumed (DX > 7), shift one more byte into the bottom of AX.
  2068. sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
  2069. CMPQ BX, $0x00
  2070. JLE sequenceDecs_decodeSync_bmi2_fill_end
  2071. CMPQ DX, $0x07
  2072. JLE sequenceDecs_decodeSync_bmi2_fill_end
  2073. SHLQ $0x08, AX
  2074. SUBQ $0x01, R12
  2075. SUBQ $0x01, BX
  2076. SUBQ $0x08, DX
  2077. MOVBQZX (R12), CX
  2078. ORQ CX, AX
  2079. JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2080. sequenceDecs_decodeSync_bmi2_fill_end:
  2081. // Update offset
        // R13 = extra-bit count (bits 8-15 of the offset state). Rotating AX left by
        // DX+R13 brings the next R13 unread bits to the bottom of R14; BZHI keeps
        // just those. mo = (state >> 32) + extra bits, stored in 8(SP).
  2082. MOVQ $0x00000808, CX
  2083. BEXTRQ CX, R8, R13
  2084. MOVQ AX, R14
  2085. LEAQ (DX)(R13*1), CX
  2086. ROLQ CL, R14
  2087. BZHIQ R13, R14, R14
  2088. MOVQ CX, DX
  2089. MOVQ R8, CX
  2090. SHRQ $0x20, CX
  2091. ADDQ R14, CX
  2092. MOVQ CX, 8(SP)
  2093. // Update match length
        // Same extraction using the match length state; ml is stored in 16(SP).
  2094. MOVQ $0x00000808, CX
  2095. BEXTRQ CX, DI, R13
  2096. MOVQ AX, R14
  2097. LEAQ (DX)(R13*1), CX
  2098. ROLQ CL, R14
  2099. BZHIQ R13, R14, R14
  2100. MOVQ CX, DX
  2101. MOVQ DI, CX
  2102. SHRQ $0x20, CX
  2103. ADDQ R14, CX
  2104. MOVQ CX, 16(SP)
  2105. // Fill bitreader to have enough for the remaining
        // Identical refill logic to the one at the top of the loop.
  2106. CMPQ BX, $0x08
  2107. JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2108. MOVQ DX, CX
  2109. SHRQ $0x03, CX
  2110. SUBQ CX, R12
  2111. MOVQ (R12), AX
  2112. SUBQ CX, BX
  2113. ANDQ $0x07, DX
  2114. JMP sequenceDecs_decodeSync_bmi2_fill_2_end
  2115. sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
  2116. CMPQ BX, $0x00
  2117. JLE sequenceDecs_decodeSync_bmi2_fill_2_end
  2118. CMPQ DX, $0x07
  2119. JLE sequenceDecs_decodeSync_bmi2_fill_2_end
  2120. SHLQ $0x08, AX
  2121. SUBQ $0x01, R12
  2122. SUBQ $0x01, BX
  2123. SUBQ $0x08, DX
  2124. MOVBQZX (R12), CX
  2125. ORQ CX, AX
  2126. JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2127. sequenceDecs_decodeSync_bmi2_fill_2_end:
  2128. // Update literal length
        // Same extraction using the literal length state; ll is stored in 24(SP).
  2129. MOVQ $0x00000808, CX
  2130. BEXTRQ CX, SI, R13
  2131. MOVQ AX, R14
  2132. LEAQ (DX)(R13*1), CX
  2133. ROLQ CL, R14
  2134. BZHIQ R13, R14, R14
  2135. MOVQ CX, DX
  2136. MOVQ SI, CX
  2137. SHRQ $0x20, CX
  2138. ADDQ R14, CX
  2139. MOVQ CX, 24(SP)
  2140. // Fill bitreader for state updates
        // Save the input read pointer; R12 is reused as scratch from here on.
  2141. MOVQ R12, (SP)
        // R12 = extra-bit count of the offset state, captured before the state is
        // replaced; it drives the "Adjust offset" decision below.
  2142. MOVQ $0x00000808, CX
  2143. BEXTRQ CX, R8, R12
        // When the counter at 96(ctx) is zero the state update is skipped.
  2144. MOVQ ctx+16(FP), CX
  2145. CMPQ 96(CX), $0x00
  2146. JZ sequenceDecs_decodeSync_bmi2_skip_update
        // R13 = low byte of SI+DI+R8, i.e. the sum (mod 256) of the three
        // next-state bit counts. Read that many bits in one go into R14.
  2147. LEAQ (SI)(DI*1), R13
  2148. ADDQ R8, R13
  2149. MOVBQZX R13, R13
  2150. LEAQ (DX)(R13*1), CX
  2151. MOVQ AX, R14
  2152. MOVQ CX, DX
  2153. ROLQ CL, R14
  2154. BZHIQ R13, R14, R14
  2155. // Update Offset State
        // The offset bits are the lowest ones in R14: take them, shift them out,
        // add the base index (bits 16-31 of the state) and load the new state.
  2156. BZHIQ R8, R14, CX
  2157. SHRXQ R8, R14, R14
  2158. MOVQ $0x00001010, R13
  2159. BEXTRQ R13, R8, R8
  2160. ADDQ CX, R8
  2161. // Load ctx.ofTable
  2162. MOVQ ctx+16(FP), CX
  2163. MOVQ 48(CX), CX
  2164. MOVQ (CX)(R8*8), R8
  2165. // Update Match Length State
  2166. BZHIQ DI, R14, CX
  2167. SHRXQ DI, R14, R14
  2168. MOVQ $0x00001010, R13
  2169. BEXTRQ R13, DI, DI
  2170. ADDQ CX, DI
  2171. // Load ctx.mlTable
  2172. MOVQ ctx+16(FP), CX
  2173. MOVQ 24(CX), CX
  2174. MOVQ (CX)(DI*8), DI
  2175. // Update Literal Length State
        // Last field: whatever is left in R14 needs no further shift.
  2176. BZHIQ SI, R14, CX
  2177. MOVQ $0x00001010, R13
  2178. BEXTRQ R13, SI, SI
  2179. ADDQ CX, SI
  2180. // Load ctx.llTable
  2181. MOVQ ctx+16(FP), CX
  2182. MOVQ (CX), CX
  2183. MOVQ (CX)(SI*8), SI
  2184. sequenceDecs_decodeSync_bmi2_skip_update:
  2185. // Adjust offset
        // More than one extra offset bit: mo is used as is and pushed onto the front
        // of the three-entry offset history at 144(s), 152(s), 160(s). The 16-byte
        // move shifts entries 0 and 1 into slots 1 and 2.
  2186. MOVQ s+0(FP), CX
  2187. MOVQ 8(SP), R13
  2188. CMPQ R12, $0x01
  2189. JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
  2190. MOVUPS 144(CX), X0
  2191. MOVQ R13, 144(CX)
  2192. MOVUPS X0, 152(CX)
  2193. JMP sequenceDecs_decodeSync_bmi2_after_adjust
        // Zero or one extra bit: mo selects an entry of the offset history.
        // With ll == 0 the selector is incremented by one first.
  2194. sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
  2195. CMPQ 24(SP), $0x00000000
  2196. JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
  2197. INCQ R13
  2198. JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
        // ll != 0 and selector 0: reuse history entry 0, history stays unchanged.
  2199. sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
  2200. TESTQ R13, R13
  2201. JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2202. MOVQ 144(CX), R13
  2203. JMP sequenceDecs_decodeSync_bmi2_after_adjust
        // Non-zero selector: R14 = history[selector], except that selector 3 yields
        // history[0] - 1 (the CMOVs swap in index 0 and addend -1). The JNZ tests
        // the flags of the ADDQ: a zero result is replaced by 1.
  2204. sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
  2205. MOVQ R13, R12
  2206. XORQ R14, R14
  2207. MOVQ $-1, R15
  2208. CMPQ R13, $0x03
  2209. CMOVQEQ R14, R12
  2210. CMOVQEQ R15, R14
  2211. ADDQ 144(CX)(R12*8), R14
  2212. JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
  2213. MOVQ $0x00000001, R14
        // Rotate the history: entry 2 takes entry 1 unless the selector was 1,
        // entry 1 takes entry 0, entry 0 becomes the new offset.
  2214. sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
  2215. CMPQ R13, $0x01
  2216. JZ sequenceDecs_decodeSync_bmi2_adjust_skip
  2217. MOVQ 152(CX), R12
  2218. MOVQ R12, 160(CX)
  2219. sequenceDecs_decodeSync_bmi2_adjust_skip:
  2220. MOVQ 144(CX), R12
  2221. MOVQ R12, 152(CX)
  2222. MOVQ R14, 144(CX)
  2223. MOVQ R14, R13
  2224. sequenceDecs_decodeSync_bmi2_after_adjust:
  2225. MOVQ R13, 8(SP)
  2226. // Check values
        // Add ml+ll to the running total at 256(s), subtract ll from the literal
        // budget at 104(ctx) (negative result -> error 4), reject ml > 0x20002, and
        // reject a zero offset paired with a non-zero match length.
  2227. MOVQ 16(SP), CX
  2228. MOVQ 24(SP), R12
  2229. LEAQ (CX)(R12*1), R14
  2230. MOVQ s+0(FP), R15
  2231. ADDQ R14, 256(R15)
  2232. MOVQ ctx+16(FP), R14
  2233. SUBQ R12, 104(R14)
  2234. JS error_not_enough_literals
  2235. CMPQ CX, $0x00020002
  2236. JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
  2237. TESTQ R13, R13
  2238. JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
  2239. TESTQ CX, CX
  2240. JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
        // From here on: CX = ll, R12 = mo, R13 = ml.
  2241. sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
  2242. MOVQ 24(SP), CX
  2243. MOVQ 8(SP), R12
  2244. MOVQ 16(SP), R13
  2245. // Check if we have enough space in s.out
        // Unsigned compare of (write pointer + ll + ml) against the past-end pointer.
  2246. LEAQ (CX)(R13*1), R14
  2247. ADDQ R9, R14
  2248. CMPQ R14, 32(SP)
  2249. JA error_not_enough_space
  2250. // Copy literals
        // 16 bytes per step until at least ll bytes are copied; the last step may
        // copy past ll. The pointers and the output position advance by exactly ll.
  2251. TESTQ CX, CX
  2252. JZ check_offset
  2253. XORQ R14, R14
  2254. copy_1:
  2255. MOVUPS (R10)(R14*1), X0
  2256. MOVUPS X0, (R9)(R14*1)
  2257. ADDQ $0x10, R14
  2258. CMPQ R14, CX
  2259. JB copy_1
  2260. ADDQ CX, R10
  2261. ADDQ CX, R9
  2262. ADDQ CX, R11
  2263. // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
        // Both comparisons are signed (JG).
  2264. check_offset:
  2265. MOVQ R11, CX
  2266. ADDQ 40(SP), CX
  2267. CMPQ R12, CX
  2268. JG error_match_off_too_big
  2269. CMPQ R12, 56(SP)
  2270. JG error_match_off_too_big
  2271. // Copy match from history
        // CX = mo - output position. If mo <= output position (JLS on the SUBQ
        // flags) the match lies entirely in the current output. Otherwise the match
        // starts CX bytes before the end of the history, at R14.
  2272. MOVQ R12, CX
  2273. SUBQ R11, CX
  2274. JLS copy_match
  2275. MOVQ 48(SP), R14
  2276. SUBQ CX, R14
        // ml > CX: the match continues from history into the current output.
  2277. CMPQ R13, CX
  2278. JG copy_all_from_history
        // The whole match (ml bytes) comes from history. For ml >= 16: copy 16-byte
        // blocks, then one final 16-byte block that ends exactly at the end of the
        // match (it may overlap the previous block).
  2279. MOVQ R13, CX
  2280. SUBQ $0x10, CX
  2281. JB copy_4_small
  2282. copy_4_loop:
  2283. MOVUPS (R14), X0
  2284. MOVUPS X0, (R9)
  2285. ADDQ $0x10, R14
  2286. ADDQ $0x10, R9
  2287. SUBQ $0x10, CX
  2288. JAE copy_4_loop
  2289. LEAQ 16(R14)(CX*1), R14
  2290. LEAQ 16(R9)(CX*1), R9
  2291. MOVUPS -16(R14), X0
  2292. MOVUPS X0, -16(R9)
  2293. JMP copy_4_end
        // ml < 16: exact-length copy using one head and one tail move that may
        // overlap. There is no 1-2 byte case here.
        // NOTE(review): this appears to rely on ml being at least 3 - confirm.
  2294. copy_4_small:
  2295. CMPQ R13, $0x03
  2296. JE copy_4_move_3
  2297. CMPQ R13, $0x08
  2298. JB copy_4_move_4through7
  2299. JMP copy_4_move_8through16
  2300. copy_4_move_3:
  2301. MOVW (R14), CX
  2302. MOVB 2(R14), R12
  2303. MOVW CX, (R9)
  2304. MOVB R12, 2(R9)
  2305. ADDQ R13, R14
  2306. ADDQ R13, R9
  2307. JMP copy_4_end
  2308. copy_4_move_4through7:
  2309. MOVL (R14), CX
  2310. MOVL -4(R14)(R13*1), R12
  2311. MOVL CX, (R9)
  2312. MOVL R12, -4(R9)(R13*1)
  2313. ADDQ R13, R14
  2314. ADDQ R13, R9
  2315. JMP copy_4_end
  2316. copy_4_move_8through16:
  2317. MOVQ (R14), CX
  2318. MOVQ -8(R14)(R13*1), R12
  2319. MOVQ CX, (R9)
  2320. MOVQ R12, -8(R9)(R13*1)
  2321. ADDQ R13, R14
  2322. ADDQ R13, R9
  2323. copy_4_end:
  2324. ADDQ R13, R11
  2325. JMP handle_loop
        // NOTE(review): unreachable - the unconditional jump above always leaves.
  2326. JMP loop_finished
        // The match spans history and current output: copy the CX bytes that live
        // in history first (same block/head-tail scheme as copy_4, with R15 and BP
        // as scratch), then fall through to copy_match for the remaining bytes.
        // NOTE(review): BP is clobbered here; confirm the assembler-inserted frame
        // pointer save/restore (non-zero frame size) covers this.
  2327. copy_all_from_history:
  2328. MOVQ CX, R15
  2329. SUBQ $0x10, R15
  2330. JB copy_5_small
  2331. copy_5_loop:
  2332. MOVUPS (R14), X0
  2333. MOVUPS X0, (R9)
  2334. ADDQ $0x10, R14
  2335. ADDQ $0x10, R9
  2336. SUBQ $0x10, R15
  2337. JAE copy_5_loop
  2338. LEAQ 16(R14)(R15*1), R14
  2339. LEAQ 16(R9)(R15*1), R9
  2340. MOVUPS -16(R14), X0
  2341. MOVUPS X0, -16(R9)
  2342. JMP copy_5_end
  2343. copy_5_small:
  2344. CMPQ CX, $0x03
  2345. JE copy_5_move_3
  2346. JB copy_5_move_1or2
  2347. CMPQ CX, $0x08
  2348. JB copy_5_move_4through7
  2349. JMP copy_5_move_8through16
  2350. copy_5_move_1or2:
  2351. MOVB (R14), R15
  2352. MOVB -1(R14)(CX*1), BP
  2353. MOVB R15, (R9)
  2354. MOVB BP, -1(R9)(CX*1)
  2355. ADDQ CX, R14
  2356. ADDQ CX, R9
  2357. JMP copy_5_end
  2358. copy_5_move_3:
  2359. MOVW (R14), R15
  2360. MOVB 2(R14), BP
  2361. MOVW R15, (R9)
  2362. MOVB BP, 2(R9)
  2363. ADDQ CX, R14
  2364. ADDQ CX, R9
  2365. JMP copy_5_end
  2366. copy_5_move_4through7:
  2367. MOVL (R14), R15
  2368. MOVL -4(R14)(CX*1), BP
  2369. MOVL R15, (R9)
  2370. MOVL BP, -4(R9)(CX*1)
  2371. ADDQ CX, R14
  2372. ADDQ CX, R9
  2373. JMP copy_5_end
  2374. copy_5_move_8through16:
  2375. MOVQ (R14), R15
  2376. MOVQ -8(R14)(CX*1), BP
  2377. MOVQ R15, (R9)
  2378. MOVQ BP, -8(R9)(CX*1)
  2379. ADDQ CX, R14
  2380. ADDQ CX, R9
        // Account for the CX bytes taken from history; R13 = bytes still to copy.
  2381. copy_5_end:
  2382. ADDQ CX, R11
  2383. SUBQ CX, R13
  2384. // Copy match from the current buffer
        // CX = write pointer - mo: the source inside the output written so far.
  2385. copy_match:
  2386. MOVQ R9, CX
  2387. SUBQ R12, CX
  2388. // ml <= mo
  2389. CMPQ R13, R12
  2390. JA copy_overlapping_match
  2391. // Copy non-overlapping match
        // R12 becomes the running destination; R9 is advanced by ml up front.
        // 16 bytes per step; JHI keeps looping only while the remaining count is
        // still above zero after the subtraction, so the last step may over-copy.
  2392. ADDQ R13, R11
  2393. MOVQ R9, R12
  2394. ADDQ R13, R9
  2395. copy_2:
  2396. MOVUPS (CX), X0
  2397. MOVUPS X0, (R12)
  2398. ADDQ $0x10, CX
  2399. ADDQ $0x10, R12
  2400. SUBQ $0x10, R13
  2401. JHI copy_2
  2402. JMP handle_loop
  2403. // Copy overlapping match
        // ml > mo: source and destination overlap, so copy one byte at a time to
        // let already-written bytes feed later ones.
  2404. copy_overlapping_match:
  2405. ADDQ R13, R11
  2406. copy_slow_3:
  2407. MOVB (CX), R12
  2408. MOVB R12, (R9)
  2409. INCQ CX
  2410. INCQ R9
  2411. DECQ R13
  2412. JNZ copy_slow_3
        // Decrement the counter at 96(ctx); keep looping while it is not negative.
  2413. handle_loop:
  2414. MOVQ ctx+16(FP), CX
  2415. DECQ 96(CX)
  2416. JNS sequenceDecs_decodeSync_bmi2_main_loop
        // Write the bit reader back: bit buffer, bits consumed, bytes left.
  2417. loop_finished:
  2418. MOVQ br+8(FP), CX
  2419. MOVQ AX, 32(CX)
  2420. MOVB DL, 40(CX)
  2421. MOVQ BX, 24(CX)
  2422. // Update the context
        // 136(ctx) = output position; 168(ctx) = literals read pointer minus the
        // literals base at 144(ctx), i.e. the number of literal bytes consumed.
  2423. MOVQ ctx+16(FP), AX
  2424. MOVQ R11, 136(AX)
  2425. MOVQ 144(AX), CX
  2426. SUBQ CX, R10
  2427. MOVQ R10, 168(AX)
  2428. // Return success
  2429. MOVQ $0x00000000, ret+24(FP)
  2430. RET
  2431. // Return with match length error
        // Reports ml in 216(ctx); returns 1.
  2432. sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
  2433. MOVQ 16(SP), AX
  2434. MOVQ ctx+16(FP), CX
  2435. MOVQ AX, 216(CX)
  2436. MOVQ $0x00000001, ret+24(FP)
  2437. RET
  2438. // Return with match too long error
        // Reports ml in 216(ctx); returns 2.
  2439. sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
  2440. MOVQ ctx+16(FP), AX
  2441. MOVQ 16(SP), CX
  2442. MOVQ CX, 216(AX)
  2443. MOVQ $0x00000002, ret+24(FP)
  2444. RET
  2445. // Return with match offset too long error
        // Reports mo in 224(ctx) and the output position in 136(ctx); returns 3.
  2446. error_match_off_too_big:
  2447. MOVQ ctx+16(FP), AX
  2448. MOVQ 8(SP), CX
  2449. MOVQ CX, 224(AX)
  2450. MOVQ R11, 136(AX)
  2451. MOVQ $0x00000003, ret+24(FP)
  2452. RET
  2453. // Return with not enough literals error
        // Reports ll in 208(ctx); returns 4.
  2454. error_not_enough_literals:
  2455. MOVQ ctx+16(FP), AX
  2456. MOVQ 24(SP), CX
  2457. MOVQ CX, 208(AX)
  2458. MOVQ $0x00000004, ret+24(FP)
  2459. RET
  2460. // Return with not enough output space error
        // Reports ll in 208(ctx), ml in 216(ctx) and the output position in
        // 136(ctx); returns 5.
  2461. error_not_enough_space:
  2462. MOVQ ctx+16(FP), AX
  2463. MOVQ 24(SP), CX
  2464. MOVQ CX, 208(AX)
  2465. MOVQ 16(SP), CX
  2466. MOVQ CX, 216(AX)
  2467. MOVQ R11, 136(AX)
  2468. MOVQ $0x00000005, ret+24(FP)
  2469. RET
  2470. // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2471. // Requires: CMOV, SSE
  2472. TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
  2473. MOVQ br+8(FP), AX
  2474. MOVQ 32(AX), DX
  2475. MOVBQZX 40(AX), BX
  2476. MOVQ 24(AX), SI
  2477. MOVQ (AX), AX
  2478. ADDQ SI, AX
  2479. MOVQ AX, (SP)
  2480. MOVQ ctx+16(FP), AX
  2481. MOVQ 72(AX), DI
  2482. MOVQ 80(AX), R8
  2483. MOVQ 88(AX), R9
  2484. XORQ CX, CX
  2485. MOVQ CX, 8(SP)
  2486. MOVQ CX, 16(SP)
  2487. MOVQ CX, 24(SP)
  2488. MOVQ 112(AX), R10
  2489. MOVQ 128(AX), CX
  2490. MOVQ CX, 32(SP)
  2491. MOVQ 144(AX), R11
  2492. MOVQ 136(AX), R12
  2493. MOVQ 200(AX), CX
  2494. MOVQ CX, 56(SP)
  2495. MOVQ 176(AX), CX
  2496. MOVQ CX, 48(SP)
  2497. MOVQ 184(AX), AX
  2498. MOVQ AX, 40(SP)
  2499. MOVQ 40(SP), AX
  2500. ADDQ AX, 48(SP)
  2501. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2502. ADDQ R10, 32(SP)
  2503. // outBase += outPosition
  2504. ADDQ R12, R10
  2505. sequenceDecs_decodeSync_safe_amd64_main_loop:
  2506. MOVQ (SP), R13
  2507. // Fill bitreader to have enough for the offset and match length.
  2508. CMPQ SI, $0x08
  2509. JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2510. MOVQ BX, AX
  2511. SHRQ $0x03, AX
  2512. SUBQ AX, R13
  2513. MOVQ (R13), DX
  2514. SUBQ AX, SI
  2515. ANDQ $0x07, BX
  2516. JMP sequenceDecs_decodeSync_safe_amd64_fill_end
  2517. sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
  2518. CMPQ SI, $0x00
  2519. JLE sequenceDecs_decodeSync_safe_amd64_fill_end
  2520. CMPQ BX, $0x07
  2521. JLE sequenceDecs_decodeSync_safe_amd64_fill_end
  2522. SHLQ $0x08, DX
  2523. SUBQ $0x01, R13
  2524. SUBQ $0x01, SI
  2525. SUBQ $0x08, BX
  2526. MOVBQZX (R13), AX
  2527. ORQ AX, DX
  2528. JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2529. sequenceDecs_decodeSync_safe_amd64_fill_end:
  2530. // Update offset
  2531. MOVQ R9, AX
  2532. MOVQ BX, CX
  2533. MOVQ DX, R14
  2534. SHLQ CL, R14
  2535. MOVB AH, CL
  2536. SHRQ $0x20, AX
  2537. TESTQ CX, CX
  2538. JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2539. ADDQ CX, BX
  2540. CMPQ BX, $0x40
  2541. JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2542. CMPQ CX, $0x40
  2543. JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2544. NEGQ CX
  2545. SHRQ CL, R14
  2546. ADDQ R14, AX
  2547. sequenceDecs_decodeSync_safe_amd64_of_update_zero:
  2548. MOVQ AX, 8(SP)
  2549. // Update match length
  2550. MOVQ R8, AX
  2551. MOVQ BX, CX
  2552. MOVQ DX, R14
  2553. SHLQ CL, R14
  2554. MOVB AH, CL
  2555. SHRQ $0x20, AX
  2556. TESTQ CX, CX
  2557. JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2558. ADDQ CX, BX
  2559. CMPQ BX, $0x40
  2560. JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2561. CMPQ CX, $0x40
  2562. JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2563. NEGQ CX
  2564. SHRQ CL, R14
  2565. ADDQ R14, AX
  2566. sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
  2567. MOVQ AX, 16(SP)
  2568. // Fill bitreader to have enough for the remaining
  2569. CMPQ SI, $0x08
  2570. JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2571. MOVQ BX, AX
  2572. SHRQ $0x03, AX
  2573. SUBQ AX, R13
  2574. MOVQ (R13), DX
  2575. SUBQ AX, SI
  2576. ANDQ $0x07, BX
  2577. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2578. sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
  2579. CMPQ SI, $0x00
  2580. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2581. CMPQ BX, $0x07
  2582. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2583. SHLQ $0x08, DX
  2584. SUBQ $0x01, R13
  2585. SUBQ $0x01, SI
  2586. SUBQ $0x08, BX
  2587. MOVBQZX (R13), AX
  2588. ORQ AX, DX
  2589. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2590. sequenceDecs_decodeSync_safe_amd64_fill_2_end:
  2591. // Update literal length
  2592. MOVQ DI, AX
  2593. MOVQ BX, CX
  2594. MOVQ DX, R14
  2595. SHLQ CL, R14
  2596. MOVB AH, CL
  2597. SHRQ $0x20, AX
  2598. TESTQ CX, CX
  2599. JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2600. ADDQ CX, BX
  2601. CMPQ BX, $0x40
  2602. JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2603. CMPQ CX, $0x40
  2604. JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2605. NEGQ CX
  2606. SHRQ CL, R14
  2607. ADDQ R14, AX
  2608. sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
  2609. MOVQ AX, 24(SP)
  2610. // Fill bitreader for state updates
  2611. MOVQ R13, (SP)
  2612. MOVQ R9, AX
  2613. SHRQ $0x08, AX
  2614. MOVBQZX AL, AX
  2615. MOVQ ctx+16(FP), CX
  2616. CMPQ 96(CX), $0x00
  2617. JZ sequenceDecs_decodeSync_safe_amd64_skip_update
  2618. // Update Literal Length State
  2619. MOVBQZX DI, R13
  2620. SHRQ $0x10, DI
  2621. MOVWQZX DI, DI
  2622. LEAQ (BX)(R13*1), CX
  2623. MOVQ DX, R14
  2624. MOVQ CX, BX
  2625. ROLQ CL, R14
  2626. MOVL $0x00000001, R15
  2627. MOVB R13, CL
  2628. SHLL CL, R15
  2629. DECL R15
  2630. ANDQ R15, R14
  2631. ADDQ R14, DI
  2632. // Load ctx.llTable
  2633. MOVQ ctx+16(FP), CX
  2634. MOVQ (CX), CX
  2635. MOVQ (CX)(DI*8), DI
  2636. // Update Match Length State
  2637. MOVBQZX R8, R13
  2638. SHRQ $0x10, R8
  2639. MOVWQZX R8, R8
  2640. LEAQ (BX)(R13*1), CX
  2641. MOVQ DX, R14
  2642. MOVQ CX, BX
  2643. ROLQ CL, R14
  2644. MOVL $0x00000001, R15
  2645. MOVB R13, CL
  2646. SHLL CL, R15
  2647. DECL R15
  2648. ANDQ R15, R14
  2649. ADDQ R14, R8
  2650. // Load ctx.mlTable
  2651. MOVQ ctx+16(FP), CX
  2652. MOVQ 24(CX), CX
  2653. MOVQ (CX)(R8*8), R8
  2654. // Update Offset State
  2655. MOVBQZX R9, R13
  2656. SHRQ $0x10, R9
  2657. MOVWQZX R9, R9
  2658. LEAQ (BX)(R13*1), CX
  2659. MOVQ DX, R14
  2660. MOVQ CX, BX
  2661. ROLQ CL, R14
  2662. MOVL $0x00000001, R15
  2663. MOVB R13, CL
  2664. SHLL CL, R15
  2665. DECL R15
  2666. ANDQ R15, R14
  2667. ADDQ R14, R9
  2668. // Load ctx.ofTable
  2669. MOVQ ctx+16(FP), CX
  2670. MOVQ 48(CX), CX
  2671. MOVQ (CX)(R9*8), R9
  2672. sequenceDecs_decodeSync_safe_amd64_skip_update:
  2673. // Adjust offset
  2674. MOVQ s+0(FP), CX
  2675. MOVQ 8(SP), R13
  2676. CMPQ AX, $0x01
  2677. JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
  2678. MOVUPS 144(CX), X0
  2679. MOVQ R13, 144(CX)
  2680. MOVUPS X0, 152(CX)
  2681. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
  2682. sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
  2683. CMPQ 24(SP), $0x00000000
  2684. JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
  2685. INCQ R13
  2686. JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2687. sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
  2688. TESTQ R13, R13
  2689. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2690. MOVQ 144(CX), R13
  2691. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
  2692. sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
  2693. MOVQ R13, AX
  2694. XORQ R14, R14
  2695. MOVQ $-1, R15
  2696. CMPQ R13, $0x03
  2697. CMOVQEQ R14, AX
  2698. CMOVQEQ R15, R14
  2699. ADDQ 144(CX)(AX*8), R14
  2700. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
  2701. MOVQ $0x00000001, R14
  2702. sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
  2703. CMPQ R13, $0x01
  2704. JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
  2705. MOVQ 152(CX), AX
  2706. MOVQ AX, 160(CX)
  2707. sequenceDecs_decodeSync_safe_amd64_adjust_skip:
  2708. MOVQ 144(CX), AX
  2709. MOVQ AX, 152(CX)
  2710. MOVQ R14, 144(CX)
  2711. MOVQ R14, R13
  2712. sequenceDecs_decodeSync_safe_amd64_after_adjust:
  2713. MOVQ R13, 8(SP)
  2714. // Check values
  2715. MOVQ 16(SP), AX
  2716. MOVQ 24(SP), CX
  2717. LEAQ (AX)(CX*1), R14
  2718. MOVQ s+0(FP), R15
  2719. ADDQ R14, 256(R15)
  2720. MOVQ ctx+16(FP), R14
  2721. SUBQ CX, 104(R14)
  2722. JS error_not_enough_literals
  2723. CMPQ AX, $0x00020002
  2724. JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
  2725. TESTQ R13, R13
  2726. JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
  2727. TESTQ AX, AX
  2728. JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
  2729. sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
  2730. MOVQ 24(SP), AX
  2731. MOVQ 8(SP), CX
  2732. MOVQ 16(SP), R13
  2733. // Check if we have enough space in s.out
  2734. LEAQ (AX)(R13*1), R14
  2735. ADDQ R10, R14
  2736. CMPQ R14, 32(SP)
  2737. JA error_not_enough_space
  2738. // Copy literals
  2739. TESTQ AX, AX
  2740. JZ check_offset
  2741. MOVQ AX, R14
  2742. SUBQ $0x10, R14
  2743. JB copy_1_small
  2744. copy_1_loop:
  2745. MOVUPS (R11), X0
  2746. MOVUPS X0, (R10)
  2747. ADDQ $0x10, R11
  2748. ADDQ $0x10, R10
  2749. SUBQ $0x10, R14
  2750. JAE copy_1_loop
  2751. LEAQ 16(R11)(R14*1), R11
  2752. LEAQ 16(R10)(R14*1), R10
  2753. MOVUPS -16(R11), X0
  2754. MOVUPS X0, -16(R10)
  2755. JMP copy_1_end
  2756. copy_1_small:
  2757. CMPQ AX, $0x03
  2758. JE copy_1_move_3
  2759. JB copy_1_move_1or2
  2760. CMPQ AX, $0x08
  2761. JB copy_1_move_4through7
  2762. JMP copy_1_move_8through16
  2763. copy_1_move_1or2:
  2764. MOVB (R11), R14
  2765. MOVB -1(R11)(AX*1), R15
  2766. MOVB R14, (R10)
  2767. MOVB R15, -1(R10)(AX*1)
  2768. ADDQ AX, R11
  2769. ADDQ AX, R10
  2770. JMP copy_1_end
  2771. copy_1_move_3:
  2772. MOVW (R11), R14
  2773. MOVB 2(R11), R15
  2774. MOVW R14, (R10)
  2775. MOVB R15, 2(R10)
  2776. ADDQ AX, R11
  2777. ADDQ AX, R10
  2778. JMP copy_1_end
  2779. copy_1_move_4through7:
  2780. MOVL (R11), R14
  2781. MOVL -4(R11)(AX*1), R15
  2782. MOVL R14, (R10)
  2783. MOVL R15, -4(R10)(AX*1)
  2784. ADDQ AX, R11
  2785. ADDQ AX, R10
  2786. JMP copy_1_end
  2787. copy_1_move_8through16:
  2788. MOVQ (R11), R14
  2789. MOVQ -8(R11)(AX*1), R15
  2790. MOVQ R14, (R10)
  2791. MOVQ R15, -8(R10)(AX*1)
  2792. ADDQ AX, R11
  2793. ADDQ AX, R10
  2794. copy_1_end:
  2795. ADDQ AX, R12
  2796. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  2797. check_offset:
  2798. MOVQ R12, AX
  2799. ADDQ 40(SP), AX
  2800. CMPQ CX, AX
  2801. JG error_match_off_too_big
  2802. CMPQ CX, 56(SP)
  2803. JG error_match_off_too_big
  2804. // Copy match from history
  2805. MOVQ CX, AX
  2806. SUBQ R12, AX
  2807. JLS copy_match
  2808. MOVQ 48(SP), R14
  2809. SUBQ AX, R14
  2810. CMPQ R13, AX
  2811. JG copy_all_from_history
  2812. MOVQ R13, AX
  2813. SUBQ $0x10, AX
  2814. JB copy_4_small
  2815. copy_4_loop:
  2816. MOVUPS (R14), X0
  2817. MOVUPS X0, (R10)
  2818. ADDQ $0x10, R14
  2819. ADDQ $0x10, R10
  2820. SUBQ $0x10, AX
  2821. JAE copy_4_loop
  2822. LEAQ 16(R14)(AX*1), R14
  2823. LEAQ 16(R10)(AX*1), R10
  2824. MOVUPS -16(R14), X0
  2825. MOVUPS X0, -16(R10)
  2826. JMP copy_4_end
  2827. copy_4_small:
  2828. CMPQ R13, $0x03
  2829. JE copy_4_move_3
  2830. CMPQ R13, $0x08
  2831. JB copy_4_move_4through7
  2832. JMP copy_4_move_8through16
  2833. copy_4_move_3:
  2834. MOVW (R14), AX
  2835. MOVB 2(R14), CL
  2836. MOVW AX, (R10)
  2837. MOVB CL, 2(R10)
  2838. ADDQ R13, R14
  2839. ADDQ R13, R10
  2840. JMP copy_4_end
  2841. copy_4_move_4through7:
  2842. MOVL (R14), AX
  2843. MOVL -4(R14)(R13*1), CX
  2844. MOVL AX, (R10)
  2845. MOVL CX, -4(R10)(R13*1)
  2846. ADDQ R13, R14
  2847. ADDQ R13, R10
  2848. JMP copy_4_end
  2849. copy_4_move_8through16:
  2850. MOVQ (R14), AX
  2851. MOVQ -8(R14)(R13*1), CX
  2852. MOVQ AX, (R10)
  2853. MOVQ CX, -8(R10)(R13*1)
  2854. ADDQ R13, R14
  2855. ADDQ R13, R10
  2856. copy_4_end:
  2857. ADDQ R13, R12
  2858. JMP handle_loop
  2859. JMP loop_finished
  2860. copy_all_from_history:
  2861. MOVQ AX, R15
  2862. SUBQ $0x10, R15
  2863. JB copy_5_small
  2864. copy_5_loop:
  2865. MOVUPS (R14), X0
  2866. MOVUPS X0, (R10)
  2867. ADDQ $0x10, R14
  2868. ADDQ $0x10, R10
  2869. SUBQ $0x10, R15
  2870. JAE copy_5_loop
  2871. LEAQ 16(R14)(R15*1), R14
  2872. LEAQ 16(R10)(R15*1), R10
  2873. MOVUPS -16(R14), X0
  2874. MOVUPS X0, -16(R10)
  2875. JMP copy_5_end
  2876. copy_5_small:
  2877. CMPQ AX, $0x03
  2878. JE copy_5_move_3
  2879. JB copy_5_move_1or2
  2880. CMPQ AX, $0x08
  2881. JB copy_5_move_4through7
  2882. JMP copy_5_move_8through16
  2883. copy_5_move_1or2:
  2884. MOVB (R14), R15
  2885. MOVB -1(R14)(AX*1), BP
  2886. MOVB R15, (R10)
  2887. MOVB BP, -1(R10)(AX*1)
  2888. ADDQ AX, R14
  2889. ADDQ AX, R10
  2890. JMP copy_5_end
  2891. copy_5_move_3:
  2892. MOVW (R14), R15
  2893. MOVB 2(R14), BP
  2894. MOVW R15, (R10)
  2895. MOVB BP, 2(R10)
  2896. ADDQ AX, R14
  2897. ADDQ AX, R10
  2898. JMP copy_5_end
  2899. copy_5_move_4through7:
  2900. MOVL (R14), R15
  2901. MOVL -4(R14)(AX*1), BP
  2902. MOVL R15, (R10)
  2903. MOVL BP, -4(R10)(AX*1)
  2904. ADDQ AX, R14
  2905. ADDQ AX, R10
  2906. JMP copy_5_end
  2907. copy_5_move_8through16:
  2908. MOVQ (R14), R15
  2909. MOVQ -8(R14)(AX*1), BP
  2910. MOVQ R15, (R10)
  2911. MOVQ BP, -8(R10)(AX*1)
  2912. ADDQ AX, R14
  2913. ADDQ AX, R10
  2914. copy_5_end:
  2915. ADDQ AX, R12
  2916. SUBQ AX, R13
  2917. // Copy match from the current buffer
  2918. copy_match:
  2919. MOVQ R10, AX
  2920. SUBQ CX, AX
  2921. // ml <= mo
  2922. CMPQ R13, CX
  2923. JA copy_overlapping_match
  2924. // Copy non-overlapping match
  2925. ADDQ R13, R12
  2926. MOVQ R13, CX
  2927. SUBQ $0x10, CX
  2928. JB copy_2_small
  2929. copy_2_loop:
  2930. MOVUPS (AX), X0
  2931. MOVUPS X0, (R10)
  2932. ADDQ $0x10, AX
  2933. ADDQ $0x10, R10
  2934. SUBQ $0x10, CX
  2935. JAE copy_2_loop
  2936. LEAQ 16(AX)(CX*1), AX
  2937. LEAQ 16(R10)(CX*1), R10
  2938. MOVUPS -16(AX), X0
  2939. MOVUPS X0, -16(R10)
  2940. JMP copy_2_end
  2941. copy_2_small:
  2942. CMPQ R13, $0x03
  2943. JE copy_2_move_3
  2944. JB copy_2_move_1or2
  2945. CMPQ R13, $0x08
  2946. JB copy_2_move_4through7
  2947. JMP copy_2_move_8through16
  2948. copy_2_move_1or2:
  2949. MOVB (AX), CL
  2950. MOVB -1(AX)(R13*1), R14
  2951. MOVB CL, (R10)
  2952. MOVB R14, -1(R10)(R13*1)
  2953. ADDQ R13, AX
  2954. ADDQ R13, R10
  2955. JMP copy_2_end
  2956. copy_2_move_3:
  2957. MOVW (AX), CX
  2958. MOVB 2(AX), R14
  2959. MOVW CX, (R10)
  2960. MOVB R14, 2(R10)
  2961. ADDQ R13, AX
  2962. ADDQ R13, R10
  2963. JMP copy_2_end
  2964. copy_2_move_4through7:
  2965. MOVL (AX), CX
  2966. MOVL -4(AX)(R13*1), R14
  2967. MOVL CX, (R10)
  2968. MOVL R14, -4(R10)(R13*1)
  2969. ADDQ R13, AX
  2970. ADDQ R13, R10
  2971. JMP copy_2_end
  2972. copy_2_move_8through16:
  2973. MOVQ (AX), CX
  2974. MOVQ -8(AX)(R13*1), R14
  2975. MOVQ CX, (R10)
  2976. MOVQ R14, -8(R10)(R13*1)
  2977. ADDQ R13, AX
  2978. ADDQ R13, R10
  2979. copy_2_end:
  2980. JMP handle_loop
  2981. // Copy overlapping match
  2982. copy_overlapping_match:
  2983. ADDQ R13, R12
  2984. copy_slow_3:
  2985. MOVB (AX), CL
  2986. MOVB CL, (R10)
  2987. INCQ AX
  2988. INCQ R10
  2989. DECQ R13
  2990. JNZ copy_slow_3
  2991. handle_loop:
  2992. MOVQ ctx+16(FP), AX
  2993. DECQ 96(AX)
  2994. JNS sequenceDecs_decodeSync_safe_amd64_main_loop
  2995. loop_finished:
  2996. MOVQ br+8(FP), AX
  2997. MOVQ DX, 32(AX)
  2998. MOVB BL, 40(AX)
  2999. MOVQ SI, 24(AX)
  3000. // Update the context
  3001. MOVQ ctx+16(FP), AX
  3002. MOVQ R12, 136(AX)
  3003. MOVQ 144(AX), CX
  3004. SUBQ CX, R11
  3005. MOVQ R11, 168(AX)
  3006. // Return success
  3007. MOVQ $0x00000000, ret+24(FP)
  3008. RET
  3009. // Return with match length error
  3010. sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
  3011. MOVQ 16(SP), AX
  3012. MOVQ ctx+16(FP), CX
  3013. MOVQ AX, 216(CX)
  3014. MOVQ $0x00000001, ret+24(FP)
  3015. RET
  3016. // Return with match too long error
  3017. sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
  3018. MOVQ ctx+16(FP), AX
  3019. MOVQ 16(SP), CX
  3020. MOVQ CX, 216(AX)
  3021. MOVQ $0x00000002, ret+24(FP)
  3022. RET
  3023. // Return with match offset too long error
  3024. error_match_off_too_big:
  3025. MOVQ ctx+16(FP), AX
  3026. MOVQ 8(SP), CX
  3027. MOVQ CX, 224(AX)
  3028. MOVQ R12, 136(AX)
  3029. MOVQ $0x00000003, ret+24(FP)
  3030. RET
  3031. // Return with not enough literals error
  3032. error_not_enough_literals:
  3033. MOVQ ctx+16(FP), AX
  3034. MOVQ 24(SP), CX
  3035. MOVQ CX, 208(AX)
  3036. MOVQ $0x00000004, ret+24(FP)
  3037. RET
  3038. // Return with not enough output space error
  3039. error_not_enough_space:
  3040. MOVQ ctx+16(FP), AX
  3041. MOVQ 24(SP), CX
  3042. MOVQ CX, 208(AX)
  3043. MOVQ 16(SP), CX
  3044. MOVQ CX, 216(AX)
  3045. MOVQ R12, 136(AX)
  3046. MOVQ $0x00000005, ret+24(FP)
  3047. RET
  3048. // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  3049. // Requires: BMI, BMI2, CMOV, SSE
  //
  // Decodes sequences from the bit stream and executes each one immediately:
  // the literals are copied to the output, then the match is copied from the
  // history buffer and/or from the output produced so far. The loop repeats
  // until the counter at 96(ctx) becomes negative.
  //
  // Result (ret+24(FP)):
  //   0 = success
  //   1 = match length is non-zero while the offset is zero
  //   2 = match length too big (> 0x20002)
  //   3 = match offset too big
  //   4 = not enough literals
  //   5 = not enough output space
  // On the error paths the bit-reader fields are not written back.
  //
  // Register roles (valid for the whole function):
  //   AX  = bit-reader value, loaded from / stored to 32(br)
  //   DX  = number of bits already consumed from AX, 40(br)
  //   BX  = 24(br); decremented per input byte consumed (presumably the number of unread input bytes - confirm)
  //   SI  = literal length decoder state (72(ctx))
  //   DI  = match length decoder state (80(ctx))
  //   R8  = offset decoder state (88(ctx))
  //   R9  = output write pointer (112(ctx) + output position)
  //   R10 = literals read pointer (144(ctx))
  //   R11 = output position (136(ctx))
  //   CX, R12-R15, BP, X0 = scratch
  //
  // Stack slots:
  //   (SP)   = current input read pointer ((br) + 24(br)); it moves backwards as input is consumed
  //   8(SP)  = offset of the current sequence
  //   16(SP) = match length of the current sequence
  //   24(SP) = literal length of the current sequence
  //   32(SP) = past-the-end pointer of the output buffer (112(ctx) + 128(ctx))
  //   40(SP) = 184(ctx), used as len(hist) in the offset check
  //   48(SP) = 176(ctx) + 184(ctx), i.e. pointer just past the end of the history
  //   56(SP) = 200(ctx), used as the window size in the offset check
  3050. TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
  // Load the bit-reader state.
  3051. MOVQ br+8(FP), CX
  3052. MOVQ 32(CX), AX
  3053. MOVBQZX 40(CX), DX
  3054. MOVQ 24(CX), BX
  3055. MOVQ (CX), CX
  3056. ADDQ BX, CX
  3057. MOVQ CX, (SP)
  // Load the three decoder states and clear the per-sequence slots (offset, match length, literal length).
  3058. MOVQ ctx+16(FP), CX
  3059. MOVQ 72(CX), SI
  3060. MOVQ 80(CX), DI
  3061. MOVQ 88(CX), R8
  3062. XORQ R9, R9
  3063. MOVQ R9, 8(SP)
  3064. MOVQ R9, 16(SP)
  3065. MOVQ R9, 24(SP)
  // Load output, literals, window and history parameters from ctx.
  3066. MOVQ 112(CX), R9
  3067. MOVQ 128(CX), R10
  3068. MOVQ R10, 32(SP)
  3069. MOVQ 144(CX), R10
  3070. MOVQ 136(CX), R11
  3071. MOVQ 200(CX), R12
  3072. MOVQ R12, 56(SP)
  3073. MOVQ 176(CX), R12
  3074. MOVQ R12, 48(SP)
  3075. MOVQ 184(CX), CX
  3076. MOVQ CX, 40(SP)
  // 48(SP) += 40(SP): turn the history base pointer into an end-of-history pointer.
  3077. MOVQ 40(SP), CX
  3078. ADDQ CX, 48(SP)
  3079. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  3080. ADDQ R9, 32(SP)
  3081. // outBase += outPosition
  3082. ADDQ R11, R9
  3083. sequenceDecs_decodeSync_safe_bmi2_main_loop:
  3084. MOVQ (SP), R12
  3085. // Fill bitreader to have enough for the offset and match length.
  // Fast path (BX >= 8): step the input pointer back by DX/8 whole bytes,
  // reload 8 bytes into AX and keep only the sub-byte part of DX.
  3086. CMPQ BX, $0x08
  3087. JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3088. MOVQ DX, CX
  3089. SHRQ $0x03, CX
  3090. SUBQ CX, R12
  3091. MOVQ (R12), AX
  3092. SUBQ CX, BX
  3093. ANDQ $0x07, DX
  3094. JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
  // Slow path: shift one byte at a time into the bottom of AX while BX > 0 and DX > 7.
  3095. sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
  3096. CMPQ BX, $0x00
  3097. JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
  3098. CMPQ DX, $0x07
  3099. JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
  3100. SHLQ $0x08, AX
  3101. SUBQ $0x01, R12
  3102. SUBQ $0x01, BX
  3103. SUBQ $0x08, DX
  3104. MOVBQZX (R12), CX
  3105. ORQ CX, AX
  3106. JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3107. sequenceDecs_decodeSync_safe_bmi2_fill_end:
  3108. // Update offset
  // BEXTRQ with control 0x0808 extracts bits 8..15 of the state: the number n of extra bits to read.
  // Rotating AX left by DX+n brings the n bits that follow the DX consumed top bits into the
  // low n bits; BZHIQ then clears everything above them. DX is advanced by n.
  // The upper 32 bits of the state are added to the bits read to form the value.
  3109. MOVQ $0x00000808, CX
  3110. BEXTRQ CX, R8, R13
  3111. MOVQ AX, R14
  3112. LEAQ (DX)(R13*1), CX
  3113. ROLQ CL, R14
  3114. BZHIQ R13, R14, R14
  3115. MOVQ CX, DX
  3116. MOVQ R8, CX
  3117. SHRQ $0x20, CX
  3118. ADDQ R14, CX
  3119. MOVQ CX, 8(SP)
  3120. // Update match length
  // Same extraction as for the offset, using the match length state in DI.
  3121. MOVQ $0x00000808, CX
  3122. BEXTRQ CX, DI, R13
  3123. MOVQ AX, R14
  3124. LEAQ (DX)(R13*1), CX
  3125. ROLQ CL, R14
  3126. BZHIQ R13, R14, R14
  3127. MOVQ CX, DX
  3128. MOVQ DI, CX
  3129. SHRQ $0x20, CX
  3130. ADDQ R14, CX
  3131. MOVQ CX, 16(SP)
  3132. // Fill bitreader to have enough for the remaining
  // Same refill logic as at the top of the loop.
  3133. CMPQ BX, $0x08
  3134. JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3135. MOVQ DX, CX
  3136. SHRQ $0x03, CX
  3137. SUBQ CX, R12
  3138. MOVQ (R12), AX
  3139. SUBQ CX, BX
  3140. ANDQ $0x07, DX
  3141. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3142. sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
  3143. CMPQ BX, $0x00
  3144. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3145. CMPQ DX, $0x07
  3146. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3147. SHLQ $0x08, AX
  3148. SUBQ $0x01, R12
  3149. SUBQ $0x01, BX
  3150. SUBQ $0x08, DX
  3151. MOVBQZX (R12), CX
  3152. ORQ CX, AX
  3153. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3154. sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
  3155. // Update literal length
  // Same extraction as for the offset, using the literal length state in SI.
  3156. MOVQ $0x00000808, CX
  3157. BEXTRQ CX, SI, R13
  3158. MOVQ AX, R14
  3159. LEAQ (DX)(R13*1), CX
  3160. ROLQ CL, R14
  3161. BZHIQ R13, R14, R14
  3162. MOVQ CX, DX
  3163. MOVQ SI, CX
  3164. SHRQ $0x20, CX
  3165. ADDQ R14, CX
  3166. MOVQ CX, 24(SP)
  3167. // Fill bitreader for state updates
  // Save the input pointer, and keep the offset's extra-bit count (bits 8..15 of R8) in R12
  // before R8 is replaced below; it is needed by the offset adjustment.
  3168. MOVQ R12, (SP)
  3169. MOVQ $0x00000808, CX
  3170. BEXTRQ CX, R8, R12
  // No state update when the counter at 96(ctx) is already zero (this is the last sequence).
  3171. MOVQ ctx+16(FP), CX
  3172. CMPQ 96(CX), $0x00
  3173. JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
  // The low byte of each state is its bit count for the state update; read the sum of the
  // three counts in a single extraction into R14.
  3174. LEAQ (SI)(DI*1), R13
  3175. ADDQ R8, R13
  3176. MOVBQZX R13, R13
  3177. LEAQ (DX)(R13*1), CX
  3178. MOVQ AX, R14
  3179. MOVQ CX, DX
  3180. ROLQ CL, R14
  3181. BZHIQ R13, R14, R14
  3182. // Update Offset State
  // Take the lowest bits of R14 (count = low byte of R8), drop them from R14, and add them to
  // bits 16..31 of the old state (BEXTRQ control 0x1010) to get the index of the next state.
  3183. BZHIQ R8, R14, CX
  3184. SHRXQ R8, R14, R14
  3185. MOVQ $0x00001010, R13
  3186. BEXTRQ R13, R8, R8
  3187. ADDQ CX, R8
  3188. // Load ctx.ofTable
  3189. MOVQ ctx+16(FP), CX
  3190. MOVQ 48(CX), CX
  3191. MOVQ (CX)(R8*8), R8
  3192. // Update Match Length State
  3193. BZHIQ DI, R14, CX
  3194. SHRXQ DI, R14, R14
  3195. MOVQ $0x00001010, R13
  3196. BEXTRQ R13, DI, DI
  3197. ADDQ CX, DI
  3198. // Load ctx.mlTable
  3199. MOVQ ctx+16(FP), CX
  3200. MOVQ 24(CX), CX
  3201. MOVQ (CX)(DI*8), DI
  3202. // Update Literal Length State
  // The remaining bits of R14 belong to the literal length state, so no shift is needed afterwards.
  3203. BZHIQ SI, R14, CX
  3204. MOVQ $0x00001010, R13
  3205. BEXTRQ R13, SI, SI
  3206. ADDQ CX, SI
  3207. // Load ctx.llTable
  3208. MOVQ ctx+16(FP), CX
  3209. MOVQ (CX), CX
  3210. MOVQ (CX)(SI*8), SI
  3211. sequenceDecs_decodeSync_safe_bmi2_skip_update:
  3212. // Adjust offset
  // 144(s), 152(s) and 160(s) are three consecutive 8-byte offset slots that are shifted below
  // (presumably the most recent offsets, newest first - confirm against the sequenceDecs layout).
  // R13 = decoded offset, R12 = extra-bit count of the offset code.
  3213. MOVQ s+0(FP), CX
  3214. MOVQ 8(SP), R13
  3215. CMPQ R12, $0x01
  3216. JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
  // More than one extra bit: the decoded value is used as is. Push it in front of the stored
  // offsets: slots 0 and 1 move to slots 1 and 2 (one 16-byte move), the new offset goes to slot 0.
  3217. MOVUPS 144(CX), X0
  3218. MOVQ R13, 144(CX)
  3219. MOVUPS X0, 152(CX)
  3220. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  // At most one extra bit: the decoded value selects one of the stored offsets.
  // When the literal length is zero the selector is incremented by one.
  3221. sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
  3222. CMPQ 24(SP), $0x00000000
  3223. JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
  3224. INCQ R13
  3225. JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  // Selector 0 with a non-zero literal length: reuse slot 0 and leave the slots untouched.
  3226. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
  3227. TESTQ R13, R13
  3228. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3229. MOVQ 144(CX), R13
  3230. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  // temp (R14) = slot[R13], except for R13 == 3 where temp = slot[0] - 1.
  // A temp of zero is replaced by 1.
  3231. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
  3232. MOVQ R13, R12
  3233. XORQ R14, R14
  3234. MOVQ $-1, R15
  3235. CMPQ R13, $0x03
  3236. CMOVQEQ R14, R12
  3237. CMOVQEQ R15, R14
  3238. ADDQ 144(CX)(R12*8), R14
  3239. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
  3240. MOVQ $0x00000001, R14
  // Unless the selector is 1, slot 1 is moved to slot 2; then slot 0 moves to slot 1 and
  // temp becomes the new slot 0 and the offset of this sequence.
  3241. sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
  3242. CMPQ R13, $0x01
  3243. JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
  3244. MOVQ 152(CX), R12
  3245. MOVQ R12, 160(CX)
  3246. sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
  3247. MOVQ 144(CX), R12
  3248. MOVQ R12, 152(CX)
  3249. MOVQ R14, 144(CX)
  3250. MOVQ R14, R13
  3251. sequenceDecs_decodeSync_safe_bmi2_after_adjust:
  3252. MOVQ R13, 8(SP)
  3253. // Check values
  // CX = match length, R12 = literal length, R13 = final offset.
  // 256(s) accumulates match length + literal length (presumably s.seqSize - confirm).
  // 104(ctx) is decremented by the literal length; a negative result means the sequence
  // asks for more literals than remain.
  3254. MOVQ 16(SP), CX
  3255. MOVQ 24(SP), R12
  3256. LEAQ (CX)(R12*1), R14
  3257. MOVQ s+0(FP), R15
  3258. ADDQ R14, 256(R15)
  3259. MOVQ ctx+16(FP), R14
  3260. SUBQ R12, 104(R14)
  3261. JS error_not_enough_literals
  3262. CMPQ CX, $0x00020002
  3263. JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
  // A zero offset is only accepted together with a zero match length.
  3264. TESTQ R13, R13
  3265. JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
  3266. TESTQ CX, CX
  3267. JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
  // From here on: CX = literal length, R12 = match offset, R13 = match length.
  3268. sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
  3269. MOVQ 24(SP), CX
  3270. MOVQ 8(SP), R12
  3271. MOVQ 16(SP), R13
  3272. // Check if we have enough space in s.out
  // Fails when write pointer + literal length + match length passes the past-end pointer in 32(SP).
  3273. LEAQ (CX)(R13*1), R14
  3274. ADDQ R9, R14
  3275. CMPQ R14, 32(SP)
  3276. JA error_not_enough_space
  3277. // Copy literals
  // Copies exactly CX bytes from R10 to R9 and advances both pointers by CX.
  // 16 bytes or more: 16-byte chunks, then one final 16-byte move that ends exactly at the
  // end of the range (it may overlap bytes already copied).
  3278. TESTQ CX, CX
  3279. JZ check_offset
  3280. MOVQ CX, R14
  3281. SUBQ $0x10, R14
  3282. JB copy_1_small
  3283. copy_1_loop:
  3284. MOVUPS (R10), X0
  3285. MOVUPS X0, (R9)
  3286. ADDQ $0x10, R10
  3287. ADDQ $0x10, R9
  3288. SUBQ $0x10, R14
  3289. JAE copy_1_loop
  // R14 is now the negative remainder; position both pointers at the end of the range.
  3290. LEAQ 16(R10)(R14*1), R10
  3291. LEAQ 16(R9)(R14*1), R9
  3292. MOVUPS -16(R10), X0
  3293. MOVUPS X0, -16(R9)
  3294. JMP copy_1_end
  // Fewer than 16 bytes: dispatch on the length; each case uses at most two (possibly
  // overlapping) loads and stores covering the first and the last bytes of the range.
  3295. copy_1_small:
  3296. CMPQ CX, $0x03
  3297. JE copy_1_move_3
  3298. JB copy_1_move_1or2
  3299. CMPQ CX, $0x08
  3300. JB copy_1_move_4through7
  3301. JMP copy_1_move_8through16
  3302. copy_1_move_1or2:
  3303. MOVB (R10), R14
  3304. MOVB -1(R10)(CX*1), R15
  3305. MOVB R14, (R9)
  3306. MOVB R15, -1(R9)(CX*1)
  3307. ADDQ CX, R10
  3308. ADDQ CX, R9
  3309. JMP copy_1_end
  3310. copy_1_move_3:
  3311. MOVW (R10), R14
  3312. MOVB 2(R10), R15
  3313. MOVW R14, (R9)
  3314. MOVB R15, 2(R9)
  3315. ADDQ CX, R10
  3316. ADDQ CX, R9
  3317. JMP copy_1_end
  3318. copy_1_move_4through7:
  3319. MOVL (R10), R14
  3320. MOVL -4(R10)(CX*1), R15
  3321. MOVL R14, (R9)
  3322. MOVL R15, -4(R9)(CX*1)
  3323. ADDQ CX, R10
  3324. ADDQ CX, R9
  3325. JMP copy_1_end
  3326. copy_1_move_8through16:
  3327. MOVQ (R10), R14
  3328. MOVQ -8(R10)(CX*1), R15
  3329. MOVQ R14, (R9)
  3330. MOVQ R15, -8(R9)(CX*1)
  3331. ADDQ CX, R10
  3332. ADDQ CX, R9
  // Account for the copied literals in the output position.
  3333. copy_1_end:
  3334. ADDQ CX, R11
  3335. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  // Here t is the output position in R11; both comparisons are signed (JG).
  3336. check_offset:
  3337. MOVQ R11, CX
  3338. ADDQ 40(SP), CX
  3339. CMPQ R12, CX
  3340. JG error_match_off_too_big
  3341. CMPQ R12, 56(SP)
  3342. JG error_match_off_too_big
  3343. // Copy match from history
  // CX = offset - output position. If the offset does not exceed the output position
  // (unsigned lower-or-same), the whole match lies in the current output buffer.
  // Otherwise CX bytes of the match start in the history, at R14 = end of history - CX.
  3344. MOVQ R12, CX
  3345. SUBQ R11, CX
  3346. JLS copy_match
  3347. MOVQ 48(SP), R14
  3348. SUBQ CX, R14
  // Match longer than the part available in the history: copy that part, then continue from the output.
  3349. CMPQ R13, CX
  3350. JG copy_all_from_history
  // The whole match (R13 bytes) comes from the history.
  3351. MOVQ R13, CX
  3352. SUBQ $0x10, CX
  3353. JB copy_4_small
  3354. copy_4_loop:
  3355. MOVUPS (R14), X0
  3356. MOVUPS X0, (R9)
  3357. ADDQ $0x10, R14
  3358. ADDQ $0x10, R9
  3359. SUBQ $0x10, CX
  3360. JAE copy_4_loop
  3361. LEAQ 16(R14)(CX*1), R14
  3362. LEAQ 16(R9)(CX*1), R9
  3363. MOVUPS -16(R14), X0
  3364. MOVUPS X0, -16(R9)
  3365. JMP copy_4_end
  // NOTE(review): unlike the other small-copy dispatchers there is no 1-2 byte case here;
  // presumably match lengths below 3 never reach this path - confirm.
  3366. copy_4_small:
  3367. CMPQ R13, $0x03
  3368. JE copy_4_move_3
  3369. CMPQ R13, $0x08
  3370. JB copy_4_move_4through7
  3371. JMP copy_4_move_8through16
  3372. copy_4_move_3:
  3373. MOVW (R14), CX
  3374. MOVB 2(R14), R12
  3375. MOVW CX, (R9)
  3376. MOVB R12, 2(R9)
  3377. ADDQ R13, R14
  3378. ADDQ R13, R9
  3379. JMP copy_4_end
  3380. copy_4_move_4through7:
  3381. MOVL (R14), CX
  3382. MOVL -4(R14)(R13*1), R12
  3383. MOVL CX, (R9)
  3384. MOVL R12, -4(R9)(R13*1)
  3385. ADDQ R13, R14
  3386. ADDQ R13, R9
  3387. JMP copy_4_end
  3388. copy_4_move_8through16:
  3389. MOVQ (R14), CX
  3390. MOVQ -8(R14)(R13*1), R12
  3391. MOVQ CX, (R9)
  3392. MOVQ R12, -8(R9)(R13*1)
  3393. ADDQ R13, R14
  3394. ADDQ R13, R9
  3395. copy_4_end:
  3396. ADDQ R13, R11
  3397. JMP handle_loop
  // Unreachable: directly follows an unconditional JMP and carries no label.
  3398. JMP loop_finished
  // Copy the CX bytes that are available in the history; the rest of the match is copied
  // from the output buffer by copy_match below. BP is used as an additional scratch register.
  3399. copy_all_from_history:
  3400. MOVQ CX, R15
  3401. SUBQ $0x10, R15
  3402. JB copy_5_small
  3403. copy_5_loop:
  3404. MOVUPS (R14), X0
  3405. MOVUPS X0, (R9)
  3406. ADDQ $0x10, R14
  3407. ADDQ $0x10, R9
  3408. SUBQ $0x10, R15
  3409. JAE copy_5_loop
  3410. LEAQ 16(R14)(R15*1), R14
  3411. LEAQ 16(R9)(R15*1), R9
  3412. MOVUPS -16(R14), X0
  3413. MOVUPS X0, -16(R9)
  3414. JMP copy_5_end
  3415. copy_5_small:
  3416. CMPQ CX, $0x03
  3417. JE copy_5_move_3
  3418. JB copy_5_move_1or2
  3419. CMPQ CX, $0x08
  3420. JB copy_5_move_4through7
  3421. JMP copy_5_move_8through16
  3422. copy_5_move_1or2:
  3423. MOVB (R14), R15
  3424. MOVB -1(R14)(CX*1), BP
  3425. MOVB R15, (R9)
  3426. MOVB BP, -1(R9)(CX*1)
  3427. ADDQ CX, R14
  3428. ADDQ CX, R9
  3429. JMP copy_5_end
  3430. copy_5_move_3:
  3431. MOVW (R14), R15
  3432. MOVB 2(R14), BP
  3433. MOVW R15, (R9)
  3434. MOVB BP, 2(R9)
  3435. ADDQ CX, R14
  3436. ADDQ CX, R9
  3437. JMP copy_5_end
  3438. copy_5_move_4through7:
  3439. MOVL (R14), R15
  3440. MOVL -4(R14)(CX*1), BP
  3441. MOVL R15, (R9)
  3442. MOVL BP, -4(R9)(CX*1)
  3443. ADDQ CX, R14
  3444. ADDQ CX, R9
  3445. JMP copy_5_end
  3446. copy_5_move_8through16:
  3447. MOVQ (R14), R15
  3448. MOVQ -8(R14)(CX*1), BP
  3449. MOVQ R15, (R9)
  3450. MOVQ BP, -8(R9)(CX*1)
  3451. ADDQ CX, R14
  3452. ADDQ CX, R9
  // Advance the output position by the bytes taken from the history and reduce the
  // remaining match length accordingly.
  3453. copy_5_end:
  3454. ADDQ CX, R11
  3455. SUBQ CX, R13
  3456. // Copy match from the current buffer
  // CX = source pointer = output write pointer - offset.
  3457. copy_match:
  3458. MOVQ R9, CX
  3459. SUBQ R12, CX
  3460. // ml <= mo
  // When the match is longer than the offset, source and destination overlap and the
  // byte-by-byte loop below must be used.
  3461. CMPQ R13, R12
  3462. JA copy_overlapping_match
  3463. // Copy non-overlapping match
  // Same chunked copy scheme as for the literals; R12 is reused as the chunk counter.
  3464. ADDQ R13, R11
  3465. MOVQ R13, R12
  3466. SUBQ $0x10, R12
  3467. JB copy_2_small
  3468. copy_2_loop:
  3469. MOVUPS (CX), X0
  3470. MOVUPS X0, (R9)
  3471. ADDQ $0x10, CX
  3472. ADDQ $0x10, R9
  3473. SUBQ $0x10, R12
  3474. JAE copy_2_loop
  3475. LEAQ 16(CX)(R12*1), CX
  3476. LEAQ 16(R9)(R12*1), R9
  3477. MOVUPS -16(CX), X0
  3478. MOVUPS X0, -16(R9)
  3479. JMP copy_2_end
  3480. copy_2_small:
  3481. CMPQ R13, $0x03
  3482. JE copy_2_move_3
  3483. JB copy_2_move_1or2
  3484. CMPQ R13, $0x08
  3485. JB copy_2_move_4through7
  3486. JMP copy_2_move_8through16
  3487. copy_2_move_1or2:
  3488. MOVB (CX), R12
  3489. MOVB -1(CX)(R13*1), R14
  3490. MOVB R12, (R9)
  3491. MOVB R14, -1(R9)(R13*1)
  3492. ADDQ R13, CX
  3493. ADDQ R13, R9
  3494. JMP copy_2_end
  3495. copy_2_move_3:
  3496. MOVW (CX), R12
  3497. MOVB 2(CX), R14
  3498. MOVW R12, (R9)
  3499. MOVB R14, 2(R9)
  3500. ADDQ R13, CX
  3501. ADDQ R13, R9
  3502. JMP copy_2_end
  3503. copy_2_move_4through7:
  3504. MOVL (CX), R12
  3505. MOVL -4(CX)(R13*1), R14
  3506. MOVL R12, (R9)
  3507. MOVL R14, -4(R9)(R13*1)
  3508. ADDQ R13, CX
  3509. ADDQ R13, R9
  3510. JMP copy_2_end
  3511. copy_2_move_8through16:
  3512. MOVQ (CX), R12
  3513. MOVQ -8(CX)(R13*1), R14
  3514. MOVQ R12, (R9)
  3515. MOVQ R14, -8(R9)(R13*1)
  3516. ADDQ R13, CX
  3517. ADDQ R13, R9
  3518. copy_2_end:
  3519. JMP handle_loop
  3520. // Copy overlapping match
  // One byte at a time, so bytes written earlier in this loop can be read again as source.
  3521. copy_overlapping_match:
  3522. ADDQ R13, R11
  3523. copy_slow_3:
  3524. MOVB (CX), R12
  3525. MOVB R12, (R9)
  3526. INCQ CX
  3527. INCQ R9
  3528. DECQ R13
  3529. JNZ copy_slow_3
  // Decrement the counter at 96(ctx) and continue while it has not become negative.
  3530. handle_loop:
  3531. MOVQ ctx+16(FP), CX
  3532. DECQ 96(CX)
  3533. JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
  // Write the bit-reader state back to br.
  3534. loop_finished:
  3535. MOVQ br+8(FP), CX
  3536. MOVQ AX, 32(CX)
  3537. MOVB DL, 40(CX)
  3538. MOVQ BX, 24(CX)
  3539. // Update the context
  // 136(ctx) = output position; 168(ctx) = literals pointer - 144(ctx), i.e. the number of
  // literal bytes consumed.
  3540. MOVQ ctx+16(FP), AX
  3541. MOVQ R11, 136(AX)
  3542. MOVQ 144(AX), CX
  3543. SUBQ CX, R10
  3544. MOVQ R10, 168(AX)
  3545. // Return success
  3546. MOVQ $0x00000000, ret+24(FP)
  3547. RET
  3548. // Return with match length error
  // Stores the match length in 216(ctx).
  3549. sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
  3550. MOVQ 16(SP), AX
  3551. MOVQ ctx+16(FP), CX
  3552. MOVQ AX, 216(CX)
  3553. MOVQ $0x00000001, ret+24(FP)
  3554. RET
  3555. // Return with match too long error
  // Stores the match length in 216(ctx).
  3556. sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
  3557. MOVQ ctx+16(FP), AX
  3558. MOVQ 16(SP), CX
  3559. MOVQ CX, 216(AX)
  3560. MOVQ $0x00000002, ret+24(FP)
  3561. RET
  3562. // Return with match offset too long error
  // Stores the offset in 224(ctx) and the output position in 136(ctx).
  3563. error_match_off_too_big:
  3564. MOVQ ctx+16(FP), AX
  3565. MOVQ 8(SP), CX
  3566. MOVQ CX, 224(AX)
  3567. MOVQ R11, 136(AX)
  3568. MOVQ $0x00000003, ret+24(FP)
  3569. RET
  3570. // Return with not enough literals error
  // Stores the literal length in 208(ctx).
  3571. error_not_enough_literals:
  3572. MOVQ ctx+16(FP), AX
  3573. MOVQ 24(SP), CX
  3574. MOVQ CX, 208(AX)
  3575. MOVQ $0x00000004, ret+24(FP)
  3576. RET
  3577. // Return with not enough output space error
  // Stores the literal length in 208(ctx), the match length in 216(ctx) and the output
  // position in 136(ctx).
  3578. error_not_enough_space:
  3579. MOVQ ctx+16(FP), AX
  3580. MOVQ 24(SP), CX
  3581. MOVQ CX, 208(AX)
  3582. MOVQ 16(SP), CX
  3583. MOVQ CX, 216(AX)
  3584. MOVQ R11, 136(AX)
  3585. MOVQ $0x00000005, ret+24(FP)
  3586. RET