/*
 * Decompiled with CFR 0.152.
 */
package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.ValueUtil;
import jdk.vm.ci.meta.AllocatableValue;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.Stride;
import org.graalvm.compiler.debug.GraalError;
import org.graalvm.compiler.lir.LIRInstruction;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.SyncPort;
import org.graalvm.compiler.lir.amd64.AMD64LIRHelper;
import org.graalvm.compiler.lir.amd64.AMD64LIRInstruction;
import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;

@SyncPort(from="https://github.com/openjdk/jdk/blob/0487aa61c67de695d008af4fe75c2a3072261a6f/src/hotspot/cpu/x86/macroAssembler_x86_sha.cpp#L496-L1035", sha1="f9283840deab5f199d600017cde5548f80ca0699")
public final class AMD64SHA256AVX2Op
extends AMD64LIRInstruction {
    /** LIR instruction class descriptor for this op, created from the class literal. */
    public static final LIRInstructionClass<AMD64SHA256AVX2Op> TYPE = LIRInstructionClass.create(AMD64SHA256AVX2Op.class);
    // Register operand that the constructor requires to be rdi; emitCode copies it into its
    // input pointer register and reads the message bytes through it.
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value bufValue;
    // Register operand that the constructor requires to be rsi; emitCode loads the eight
    // 32-bit state words from it and adds the results back into it.
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value stateValue;
    // Start offset; must be rdx when multiBlock is set, may be ILLEGAL otherwise.
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    private Value ofsValue;
    // Limit offset; must be rcx when multiBlock is set, may be ILLEGAL otherwise.
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    private Value limitValue;
    // Fixed registers declared as temporaries of this instruction; filled in by the constructor.
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value[] temps;
    // When true, emitCode derives the byte count from ofs/limit in 64-byte steps and leaves the
    // advanced offset in rax; when false, exactly one 64-byte block is processed.
    private final boolean multiBlock;
    // Table added to the message words by emitCode (vpaddd through the table register). Every
    // group of four ints is stored twice in a row, so each 32-byte row holds the same four
    // values in both 128-bit halves.
    static ArrayDataPointerConstant k256W = AMD64LIRHelper.pointerConstant(16, new int[]{1116352408, 1899447441, -1245643825, -373957723, 1116352408, 1899447441, -1245643825, -373957723, 961987163, 1508970993, -1841331548, -1424204075, 961987163, 1508970993, -1841331548, -1424204075, -670586216, 310598401, 607225278, 1426881987, -670586216, 310598401, 607225278, 1426881987, 1925078388, -2132889090, -1680079193, -1046744716, 1925078388, -2132889090, -1680079193, -1046744716, -459576895, -272742522, 264347078, 604807628, -459576895, -272742522, 264347078, 604807628, 770255983, 1249150122, 1555081692, 1996064986, 770255983, 1249150122, 1555081692, 1996064986, -1740746414, -1473132947, -1341970488, -1084653625, -1740746414, -1473132947, -1341970488, -1084653625, -958395405, -710438585, 113926993, 338241895, -958395405, -710438585, 113926993, 338241895, 666307205, 773529912, 1294757372, 1396182291, 666307205, 773529912, 1294757372, 1396182291, 1695183700, 1986661051, -2117940946, -1838011259, 1695183700, 1986661051, -2117940946, -1838011259, -1564481375, -1474664885, -1035236496, -949202525, -1564481375, -1474664885, -1035236496, -949202525, -778901479, -694614492, -200395387, 275423344, -778901479, -694614492, -200395387, 275423344, 430227734, 506948616, 659060556, 883997877, 430227734, 506948616, 659060556, 883997877, 958139571, 1322822218, 1537002063, 1747873779, 958139571, 1322822218, 1537002063, 1747873779, 1955562222, 2024104815, -2067236844, -1933114872, 1955562222, 2024104815, -2067236844, -1933114872, -1866530822, -1538233109, -1090935817, -965641998, -1866530822, -1538233109, -1090935817, -965641998});
    // vpshufb control loaded into xmm13 by emitCode and applied to the freshly loaded input
    // (values are 0x0405060700010203 / 0x0c0d0e0f08090a0b, i.e. a byte reversal within each dword).
    static ArrayDataPointerConstant pshuffleByteFlipMask = AMD64LIRHelper.pointerConstant(16, new long[]{289644378169868803L, 868365760874482187L, 289644378169868803L, 868365760874482187L});
    // vpshufb control loaded into xmm10; used by the message-schedule step for iter % 4 == 2.
    static ArrayDataPointerConstant shuf00BA = AMD64LIRHelper.pointerConstant(16, new long[]{795458214199165184L, -1L, 795458214199165184L, -1L});
    // vpshufb control loaded into xmm12; used by the message-schedule step for iter % 4 == 3.
    static ArrayDataPointerConstant shufDC00 = AMD64LIRHelper.pointerConstant(16, new long[]{-1L, 795458214199165184L, -1L, 795458214199165184L});
    // NOTE(review): not referenced by any code in this class — presumably kept for parity with
    // the HotSpot original or used from elsewhere; confirm before removing.
    static ArrayDataPointerConstant ymmMask = AMD64LIRHelper.pointerConstant(16, new long[]{0L, 0L, -1L, -1L});
    // Layout of the scratch frame that emitCode reserves below rsp. Sizes come first, then each
    // offset is derived from the previous slot so the layout cannot drift out of sync.
    // 2 * 64 * 4: two blocks x 64 rounds x 4 bytes per round, per the HotSpot original
    // referenced by @SyncPort.
    private static final int XFER_SIZE = 2 * 64 * 4;
    private static final int INP_END_SIZE = 8;
    private static final int INP_SIZE = 8;
    private static final int CTX_SIZE = 8;
    private static final int RSP_SIZE = 8;

    // Staged (message word + table constant) values, indexed by the round offset register.
    private static final int OFFSET_XFER = 0;
    // Address of the last 64-byte block of the input.
    private static final int OFFSET_INP_END = OFFSET_XFER + XFER_SIZE;
    // Spilled input pointer.
    private static final int OFFSET_INP = OFFSET_INP_END + INP_END_SIZE;
    // Spilled state pointer (its register doubles as the round offset).
    private static final int OFFSET_CTX = OFFSET_INP + INP_SIZE;
    // Original rsp, restored on exit.
    private static final int OFFSET_RSP = OFFSET_CTX + CTX_SIZE;
    private static final int STACK_SIZE = OFFSET_RSP + RSP_SIZE;

    /**
     * Creates the single-block variant: {@code ofs} and {@code limit} are left
     * {@link Value#ILLEGAL} and exactly one 64-byte block is compressed.
     *
     * @param bufValue input buffer pointer, must be allocated to rdi
     * @param stateValue state pointer, must be allocated to rsi
     */
    public AMD64SHA256AVX2Op(AllocatableValue bufValue, AllocatableValue stateValue) {
        this(bufValue, stateValue, Value.ILLEGAL, Value.ILLEGAL, false);
    }

    /**
     * Creates the op with explicit operand placement. The emitted code assumes fixed input
     * registers, so the placement is verified up front.
     *
     * @param bufValue input buffer pointer, must be allocated to rdi
     * @param stateValue state pointer, must be allocated to rsi
     * @param ofsValue start offset, must be allocated to rdx when {@code multiBlock} is set
     * @param limitValue limit offset, must be allocated to rcx when {@code multiBlock} is set
     * @param multiBlock whether the byte count is derived from {@code ofs}/{@code limit}
     */
    public AMD64SHA256AVX2Op(AllocatableValue bufValue, AllocatableValue stateValue, AllocatableValue ofsValue, AllocatableValue limitValue, boolean multiBlock) {
        super(TYPE);
        GraalError.guarantee(ValueUtil.asRegister(bufValue).equals(AMD64.rdi), "expect bufValue at rdi, but was %s", bufValue);
        GraalError.guarantee(ValueUtil.asRegister(stateValue).equals(AMD64.rsi), "expect stateValue at rsi, but was %s", stateValue);
        // ofs/limit are only inspected for the multi-block variant; otherwise they may be ILLEGAL.
        GraalError.guarantee(!multiBlock || ValueUtil.asRegister(ofsValue).equals(AMD64.rdx), "expect ofsValue at rdx, but was %s", ofsValue);
        // The message previously named rdx although the check is against rcx.
        GraalError.guarantee(!multiBlock || ValueUtil.asRegister(limitValue).equals(AMD64.rcx), "expect limitValue at rcx, but was %s", limitValue);
        this.bufValue = bufValue;
        this.stateValue = stateValue;
        this.ofsValue = ofsValue;
        this.limitValue = limitValue;
        this.multiBlock = multiBlock;
        // rbx, rbp and r12-r15 are also used by emitCode but are pushed/popped there, so they
        // are not listed as temporaries.
        this.temps = new Value[]{AMD64.rax.asValue(), AMD64.rcx.asValue(), AMD64.rdx.asValue(), AMD64.rsi.asValue(), AMD64.rdi.asValue(), AMD64.r8.asValue(), AMD64.r9.asValue(), AMD64.r10.asValue(), AMD64.r11.asValue(), AMD64.xmm0.asValue(), AMD64.xmm1.asValue(), AMD64.xmm2.asValue(), AMD64.xmm3.asValue(), AMD64.xmm4.asValue(), AMD64.xmm5.asValue(), AMD64.xmm6.asValue(), AMD64.xmm7.asValue(), AMD64.xmm8.asValue(), AMD64.xmm9.asValue(), AMD64.xmm10.asValue(), AMD64.xmm11.asValue(), AMD64.xmm12.asValue(), AMD64.xmm13.asValue()};
    }

    /**
     * Emits the AVX2 SHA-256 compression sequence.
     *
     * <p>Outline of the emitted code:
     * <ol>
     * <li>save rcx, rdx, rbx, rbp and r12-r15, then reserve a 32-byte-aligned scratch frame of
     * {@code STACK_SIZE} bytes and remember the old rsp in it;</li>
     * <li>shuffle the incoming rdi/rsi/rdx/rcx operands into rcx/rdx/r8/r9;</li>
     * <li>for the multi-block variant, count the bytes between ofs and limit in 64-byte steps
     * (bailing out when the count is zero); otherwise use a count of 64;</li>
     * <li>process two 64-byte blocks per {@code loop0} iteration (the second block reuses the
     * staged values in {@code loop3}), with a dedicated path for a trailing single block;</li>
     * <li>restore rsp and the saved registers; for the multi-block variant finally advance ofs
     * in 64-byte steps until it is no longer below limit and leave that value in rax.</li>
     * </ol>
     *
     * @param crb used to record the references to the constant tables
     * @param masm assembler receiving the instructions
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Label loop0 = new Label();
        Label loop1 = new Label();
        Label loop2 = new Label();
        Label loop3 = new Label();
        Label lastBlockEnter = new Label();
        Label doLastBlock = new Label();
        Label onlyOneBlock = new Label();
        Label doneHash = new Label();
        Label computeSize = new Label();
        Label computeSizeEnd = new Label();
        Label computeSize1 = new Label();
        Label computeSizeEnd1 = new Label();

        // Register roles used throughout the emitted code.
        Register numBlks = AMD64.r8;
        Register ctx = AMD64.rdx;
        Register inp = AMD64.rcx;
        Register tbl = AMD64.rbp;
        // The round offset lives in the same register as ctx, which is why ctx is spilled.
        Register srnd = ctx;
        Register byteFlipMask = AMD64.xmm13;

        // The eight working state words.
        Register a = AMD64.rax;
        Register b = AMD64.rbx;
        Register c = AMD64.rdi;
        Register d = AMD64.rsi;
        Register e = AMD64.r8;
        Register f = AMD64.r9;
        Register g = AMD64.r10;
        Register h = AMD64.r11;
        Register[] digest = {a, b, c, d, e, f, g, h};

        // Message-schedule vectors and the raw input vectors they are built from.
        Register w0 = AMD64.xmm4;
        Register w1 = AMD64.xmm5;
        Register w2 = AMD64.xmm6;
        Register w3 = AMD64.xmm7;
        Register[] schedule = {w0, w1, w2, w3};
        Register[] rawInput = {AMD64.xmm0, AMD64.xmm1, AMD64.xmm2, AMD64.xmm3};

        // rcx/rdx (limit/ofs) are needed again at the very end; the rest is restored for the caller.
        Register[] saved = {AMD64.rcx, AMD64.rdx, AMD64.rbx, AMD64.rbp, AMD64.r12, AMD64.r13, AMD64.r14, AMD64.r15};
        for (Register reg : saved) {
            masm.push(reg);
        }

        // Carve out the aligned scratch frame and keep the previous rsp inside it.
        masm.movq(AMD64.rax, AMD64.rsp);
        masm.subq(AMD64.rsp, STACK_SIZE);
        masm.andq(AMD64.rsp, -32);
        masm.movq(new AMD64Address(AMD64.rsp, OFFSET_RSP), AMD64.rax);

        // Move the incoming operands: limit -> r9, ofs -> r8, state -> rdx, buf -> rcx.
        masm.movq(AMD64.r9, AMD64.rcx);
        masm.movq(AMD64.r8, AMD64.rdx);
        masm.movq(AMD64.rdx, AMD64.rsi);
        masm.movq(AMD64.rcx, AMD64.rdi);

        masm.leaq(inp, new AMD64Address(AMD64.rcx, 0));
        masm.movq(ctx, AMD64.rdx);

        if (multiBlock) {
            // rax counts 64 bytes for every step ofs needs to reach limit.
            masm.xorq(AMD64.rax, AMD64.rax);
            masm.bind(computeSize);
            masm.cmpqAndJcc(AMD64.r8, AMD64.r9, AMD64Assembler.ConditionFlag.AboveEqual, computeSizeEnd, true);
            masm.addq(AMD64.r8, 64);
            masm.addq(AMD64.rax, 64);
            masm.jmpb(computeSize);
            masm.bind(computeSizeEnd);
            masm.movq(numBlks, AMD64.rax);
            masm.cmpqAndJcc(numBlks, 0, AMD64Assembler.ConditionFlag.Equal, doneHash, false);
        } else {
            masm.xorq(numBlks, numBlks);
            masm.addq(numBlks, 64);
        }

        // numBlks becomes the address of the last block and is kept in the frame.
        masm.leaq(numBlks, new AMD64Address(inp, numBlks, Stride.S1, -64));
        masm.movq(new AMD64Address(AMD64.rsp, OFFSET_INP_END), numBlks);
        masm.cmpqAndJcc(inp, numBlks, AMD64Assembler.ConditionFlag.Equal, onlyOneBlock, false);

        loadDigestAndShuffleMasks(crb, masm, ctx, digest);

        // ---- two blocks per iteration ----
        masm.bind(loop0);
        masm.leaq(tbl, AMD64LIRHelper.recordExternalAddress(crb, k256W));
        for (int i = 0; i < rawInput.length; i++) {
            masm.vmovdqu(rawInput[i], new AMD64Address(inp, i * 32));
        }
        for (Register reg : rawInput) {
            masm.vpshufb(reg, reg, byteFlipMask, AVXKind.AVXSize.YMM);
        }
        // Recombine 128-bit halves (selectors 0x20 / 0x31) into the schedule vectors.
        masm.vperm2i128(w0, AMD64.xmm0, AMD64.xmm2, 0x20);
        masm.vperm2i128(w1, AMD64.xmm0, AMD64.xmm2, 0x31);
        masm.vperm2i128(w2, AMD64.xmm1, AMD64.xmm3, 0x20);
        masm.vperm2i128(w3, AMD64.xmm1, AMD64.xmm3, 0x31);

        masm.bind(lastBlockEnter);
        masm.addq(inp, 64);
        masm.movq(new AMD64Address(AMD64.rsp, OFFSET_INP), inp);
        masm.xorq(srnd, srnd);

        // ---- rounds with message scheduling; the state arguments rotate by one per round ----
        masm.align(16);
        masm.bind(loop1);
        stageRoundInput(masm, tbl, srnd, w0, 0 * 32);
        sha256AVX2OneRoundAndSched(masm, w0, w1, w2, w3, a, b, c, d, e, f, g, h, 0);
        sha256AVX2OneRoundAndSched(masm, w0, w1, w2, w3, h, a, b, c, d, e, f, g, 1);
        sha256AVX2OneRoundAndSched(masm, w0, w1, w2, w3, g, h, a, b, c, d, e, f, 2);
        sha256AVX2OneRoundAndSched(masm, w0, w1, w2, w3, f, g, h, a, b, c, d, e, 3);

        stageRoundInput(masm, tbl, srnd, w1, 1 * 32);
        sha256AVX2OneRoundAndSched(masm, w1, w2, w3, w0, e, f, g, h, a, b, c, d, 8);
        sha256AVX2OneRoundAndSched(masm, w1, w2, w3, w0, d, e, f, g, h, a, b, c, 9);
        sha256AVX2OneRoundAndSched(masm, w1, w2, w3, w0, c, d, e, f, g, h, a, b, 10);
        sha256AVX2OneRoundAndSched(masm, w1, w2, w3, w0, b, c, d, e, f, g, h, a, 11);

        stageRoundInput(masm, tbl, srnd, w2, 2 * 32);
        sha256AVX2OneRoundAndSched(masm, w2, w3, w0, w1, a, b, c, d, e, f, g, h, 16);
        sha256AVX2OneRoundAndSched(masm, w2, w3, w0, w1, h, a, b, c, d, e, f, g, 17);
        sha256AVX2OneRoundAndSched(masm, w2, w3, w0, w1, g, h, a, b, c, d, e, f, 18);
        sha256AVX2OneRoundAndSched(masm, w2, w3, w0, w1, f, g, h, a, b, c, d, e, 19);

        stageRoundInput(masm, tbl, srnd, w3, 3 * 32);
        sha256AVX2OneRoundAndSched(masm, w3, w0, w1, w2, e, f, g, h, a, b, c, d, 24);
        sha256AVX2OneRoundAndSched(masm, w3, w0, w1, w2, d, e, f, g, h, a, b, c, 25);
        sha256AVX2OneRoundAndSched(masm, w3, w0, w1, w2, c, d, e, f, g, h, a, b, 26);
        sha256AVX2OneRoundAndSched(masm, w3, w0, w1, w2, b, c, d, e, f, g, h, a, 27);

        masm.addq(srnd, 4 * 32);
        masm.cmpqAndJcc(srnd, 3 * 4 * 32, AMD64Assembler.ConditionFlag.Below, loop1, false);

        // ---- remaining rounds, no further scheduling ----
        masm.bind(loop2);
        stageRoundInput(masm, tbl, srnd, w0, 0 * 32);
        sha256AVX2FourRoundsComputeFirst(masm, 0);
        stageRoundInput(masm, tbl, srnd, w1, 1 * 32);
        sha256AVX2FourRoundsComputeLast(masm, 8);
        masm.addq(srnd, 2 * 32);
        masm.vmovdqu(w0, w2);
        masm.vmovdqu(w1, w3);
        masm.cmpqAndJcc(srnd, 4 * 4 * 32, AMD64Assembler.ConditionFlag.Below, loop2, false);

        masm.movq(ctx, new AMD64Address(AMD64.rsp, OFFSET_CTX));
        masm.movq(inp, new AMD64Address(AMD64.rsp, OFFSET_INP));
        accumulateDigest(masm, ctx, digest);
        masm.cmpqAndJcc(inp, new AMD64Address(AMD64.rsp, OFFSET_INP_END), AMD64Assembler.ConditionFlag.Above, doneHash, false);

        // ---- second block: replay the values already staged in the frame ----
        masm.xorq(srnd, srnd);
        masm.align(16);
        masm.bind(loop3);
        sha256AVX2FourRoundsComputeFirst(masm, 4);
        sha256AVX2FourRoundsComputeLast(masm, 12);
        masm.addq(srnd, 2 * 32);
        masm.cmpqAndJcc(srnd, 4 * 4 * 32, AMD64Assembler.ConditionFlag.Below, loop3, false);

        masm.movq(ctx, new AMD64Address(AMD64.rsp, OFFSET_CTX));
        masm.movq(inp, new AMD64Address(AMD64.rsp, OFFSET_INP));
        masm.addq(inp, 64);
        accumulateDigest(masm, ctx, digest);
        masm.cmpqAndJcc(inp, new AMD64Address(AMD64.rsp, OFFSET_INP_END), AMD64Assembler.ConditionFlag.Below, loop0, false);
        // Reuses the flags of the comparison above; on equality execution falls into doLastBlock.
        masm.jccb(AMD64Assembler.ConditionFlag.Above, doneHash);

        // ---- trailing single block: 128-bit loads only ----
        masm.bind(doLastBlock);
        masm.leaq(tbl, AMD64LIRHelper.recordExternalAddress(crb, k256W));
        for (int i = 0; i < schedule.length; i++) {
            masm.movdqu(schedule[i], new AMD64Address(inp, i * 16));
        }
        for (Register reg : schedule) {
            masm.vpshufb(reg, reg, byteFlipMask, AVXKind.AVXSize.XMM);
        }
        masm.jmp(lastBlockEnter);

        masm.bind(onlyOneBlock);
        loadDigestAndShuffleMasks(crb, masm, ctx, digest);
        masm.jmpb(doLastBlock);

        // ---- epilogue ----
        masm.bind(doneHash);
        masm.movq(AMD64.rsp, new AMD64Address(AMD64.rsp, OFFSET_RSP));
        for (int i = saved.length - 1; i >= 0; i--) {
            masm.pop(saved[i]);
        }

        if (multiBlock) {
            // rcx/rdx hold the original limit/ofs again after the pops above.
            Register limitEnd = AMD64.rcx;
            Register ofsEnd = AMD64.rdx;
            masm.movq(AMD64.rax, ofsEnd);
            masm.bind(computeSize1);
            masm.cmpqAndJcc(AMD64.rax, limitEnd, AMD64Assembler.ConditionFlag.AboveEqual, computeSizeEnd1, true);
            masm.addq(AMD64.rax, 64);
            masm.jmpb(computeSize1);
            masm.bind(computeSizeEnd1);
        }
    }

    /**
     * Loads the eight state words from {@code ctx} into {@code digest} (in a..h order), loads the
     * three shuffle masks into xmm13/xmm10/xmm12 and spills {@code ctx} to the frame. The seventh
     * word (g) is deliberately loaded after the masks, keeping the instruction order of the
     * original sequence.
     */
    private static void loadDigestAndShuffleMasks(CompilationResultBuilder crb, AMD64MacroAssembler masm, Register ctx, Register[] digest) {
        for (int i = 0; i < 6; i++) {
            masm.movl(digest[i], new AMD64Address(ctx, 4 * i));
        }
        masm.movl(digest[7], new AMD64Address(ctx, 4 * 7));
        masm.vmovdqu(AMD64.xmm13, AMD64LIRHelper.recordExternalAddress(crb, pshuffleByteFlipMask));
        masm.vmovdqu(AMD64.xmm10, AMD64LIRHelper.recordExternalAddress(crb, shuf00BA));
        masm.vmovdqu(AMD64.xmm12, AMD64LIRHelper.recordExternalAddress(crb, shufDC00));
        masm.movl(digest[6], new AMD64Address(ctx, 4 * 6));
        masm.movq(new AMD64Address(AMD64.rsp, OFFSET_CTX), ctx);
    }

    /** Adds each working state word into its slot in {@code ctx}, keeping the sum in both places. */
    private static void accumulateDigest(AMD64MacroAssembler masm, Register ctx, Register[] digest) {
        for (int i = 0; i < digest.length; i++) {
            addm(masm, 4 * i, ctx, digest[i]);
        }
    }

    /**
     * Adds the table row at {@code tbl + srnd + disp} to {@code msg} (result in xmm9) and stores
     * it at the same displacement of the frame's transfer area.
     */
    private static void stageRoundInput(AMD64MacroAssembler masm, Register tbl, Register srnd, Register msg, int disp) {
        masm.vpaddd(AMD64.xmm9, msg, new AMD64Address(tbl, srnd, Stride.S1, disp), AVXKind.AVXSize.YMM);
        masm.vmovdqu(new AMD64Address(AMD64.rsp, srnd, Stride.S1, OFFSET_XFER + disp), AMD64.xmm9);
    }

    /**
     * Emits one round without message scheduling. The staged round input is read from
     * {@code [rsp + rdx + 4 * iter]}.
     *
     * <p>Except for the last round of a group of four ({@code iter % 4 == 3}), the final two
     * additions into {@code regH} are postponed; the next call applies them to
     * {@code regOldH} (the same register, seen under its previous role) while the scratch
     * registers still hold the previous round's values.
     *
     * @param regOldH register that was {@code regH} in the previous round
     * @param iter round index; only {@code iter % 4} and the frame displacement depend on it
     */
    private static void sha256AVX2OneRoundCompute(AMD64MacroAssembler masm, Register regOldH, Register regA, Register regB, Register regC, Register regD, Register regE, Register regF, Register regG, Register regH, int iter) {
        final Register y0 = AMD64.r13;
        final Register y1 = AMD64.r14;
        final Register y2 = AMD64.r15;
        final Register y3 = AMD64.rcx;
        final Register t1 = AMD64.r12;
        final Register srnd = AMD64.rdx;
        final int phase = iter % 4;
        final boolean finishPreviousRound = phase > 0;

        if (finishPreviousRound) {
            // y2 still holds the previous round's value.
            masm.addl(regOldH, y2);
        }
        masm.movl(y2, regF);
        masm.rorxl(y0, regE, 25);
        masm.rorxl(y1, regE, 11);
        masm.xorl(y2, regG);
        masm.xorl(y0, y1);
        masm.rorxl(y1, regE, 6);
        masm.andl(y2, regE);
        if (finishPreviousRound) {
            // y3 is not overwritten until further down, so it is still the previous round's.
            masm.addl(regOldH, y3);
        }
        // y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
        masm.xorl(y0, y1);
        masm.rorxl(t1, regA, 13);
        // y2 = ((f ^ g) & e) ^ g
        masm.xorl(y2, regG);
        masm.rorxl(y1, regA, 22);
        masm.movl(y3, regA);
        masm.xorl(y1, t1);
        masm.rorxl(t1, regA, 2);
        masm.addl(regH, new AMD64Address(AMD64.rsp, srnd, Stride.S1, 4 * iter));
        masm.orl(y3, regC);
        // y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
        masm.xorl(y1, t1);
        masm.movl(t1, regA);
        masm.andl(y3, regB);
        masm.andl(t1, regC);
        masm.addl(y2, y0);
        masm.addl(regD, regH);
        // y3 = ((a | c) & b) | (a & c)
        masm.orl(y3, t1);
        masm.addl(regH, y1);
        masm.addl(regD, y2);
        if (phase == 3) {
            // Nothing follows within this group of four, so finish h right away.
            masm.addl(regH, y2);
            masm.addl(regH, y3);
        }
    }

    /**
     * Emits four consecutive non-scheduling rounds starting from the unrotated state
     * assignment (a..h = rax, rbx, rdi, rsi, r8, r9, r10, r11). Each round shifts the roles by
     * one register, and the register acting as {@code a} is also passed as the previous
     * round's {@code h}.
     *
     * @param start round index of the first of the four rounds
     */
    private static void sha256AVX2FourRoundsComputeFirst(AMD64MacroAssembler masm, int start) {
        final Register a = AMD64.rax;
        final Register b = AMD64.rbx;
        final Register c = AMD64.rdi;
        final Register d = AMD64.rsi;
        final Register e = AMD64.r8;
        final Register f = AMD64.r9;
        final Register g = AMD64.r10;
        final Register h = AMD64.r11;

        sha256AVX2OneRoundCompute(masm, a, a, b, c, d, e, f, g, h, start);
        sha256AVX2OneRoundCompute(masm, h, h, a, b, c, d, e, f, g, start + 1);
        sha256AVX2OneRoundCompute(masm, g, g, h, a, b, c, d, e, f, start + 2);
        sha256AVX2OneRoundCompute(masm, f, f, g, h, a, b, c, d, e, start + 3);
    }

    /**
     * Emits four consecutive non-scheduling rounds starting from the state assignment rotated
     * by four positions relative to a..h = rax, rbx, rdi, rsi, r8, r9, r10, r11 (i.e. the
     * register roles left behind by four preceding rounds). The register acting as {@code a}
     * is also passed as the previous round's {@code h}.
     *
     * @param start round index of the first of the four rounds
     */
    private static void sha256AVX2FourRoundsComputeLast(AMD64MacroAssembler masm, int start) {
        final Register a = AMD64.rax;
        final Register b = AMD64.rbx;
        final Register c = AMD64.rdi;
        final Register d = AMD64.rsi;
        final Register e = AMD64.r8;
        final Register f = AMD64.r9;
        final Register g = AMD64.r10;
        final Register h = AMD64.r11;

        sha256AVX2OneRoundCompute(masm, e, e, f, g, h, a, b, c, d, start);
        sha256AVX2OneRoundCompute(masm, d, d, e, f, g, h, a, b, c, start + 1);
        sha256AVX2OneRoundCompute(masm, c, c, d, e, f, g, h, a, b, start + 2);
        sha256AVX2OneRoundCompute(masm, b, b, c, d, e, f, g, h, a, start + 3);
    }

    /**
     * Emits one complete round followed by one quarter of the message-schedule update.
     *
     * <p>The scalar part reads the staged round input from {@code [rsp + rdx + 4 * iter]} and
     * uses rcx and r12-r15 as scratch. The vector part depends on {@code iter % 4}; the four
     * quarters communicate through xmm0-xmm3, xmm8 and xmm11, and the last one writes the new
     * schedule words into {@code vector4}. xmm10 and xmm12 must already contain the two shuffle
     * masks.
     *
     * @param vector4 first schedule vector; overwritten when {@code iter % 4 == 3}
     * @param iter round index within the staged data
     */
    private static void sha256AVX2OneRoundAndSched(AMD64MacroAssembler masm, Register vector4, Register vector5, Register vector6, Register vector7, Register regA, Register regB, Register regC, Register regD, Register regE, Register regF, Register regG, Register regH, int iter) {
        final Register y0 = AMD64.r13;
        final Register y1 = AMD64.r14;
        final Register y2 = AMD64.r15;
        final Register y3 = AMD64.rcx;
        final Register t1 = AMD64.r12;
        final Register srnd = AMD64.rdx;

        final Register xtmp0 = AMD64.xmm0;
        final Register xtmp1 = AMD64.xmm1;
        final Register xtmp2 = AMD64.xmm2;
        final Register xtmp3 = AMD64.xmm3;
        final Register xtmp4 = AMD64.xmm8;
        final Register xtmp5 = AMD64.xmm11;
        final Register maskShuf00BA = AMD64.xmm10;
        final Register maskShufDC00 = AMD64.xmm12;
        final AVXKind.AVXSize ymm = AVXKind.AVXSize.YMM;

        // ---- scalar round ----
        masm.movl(y3, regA);
        masm.rorxl(y0, regE, 25);
        masm.rorxl(y1, regE, 11);
        masm.addl(regH, new AMD64Address(AMD64.rsp, srnd, Stride.S1, 4 * iter));
        masm.orl(y3, regC);
        masm.movl(y2, regF);
        masm.rorxl(t1, regA, 13);
        masm.xorl(y0, y1);
        masm.xorl(y2, regG);
        masm.rorxl(y1, regE, 6);
        masm.andl(y2, regE);
        // y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
        masm.xorl(y0, y1);
        masm.rorxl(y1, regA, 22);
        masm.addl(regD, regH);
        masm.andl(y3, regB);
        masm.xorl(y1, t1);
        masm.rorxl(t1, regA, 2);
        // y2 = ((f ^ g) & e) ^ g
        masm.xorl(y2, regG);
        // y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
        masm.xorl(y1, t1);
        masm.movl(t1, regA);
        masm.andl(t1, regC);
        masm.addl(y2, y0);
        // y3 = ((a | c) & b) | (a & c)
        masm.orl(y3, t1);
        masm.addl(regH, y1);
        masm.addl(regD, y2);
        masm.addl(regH, y2);
        masm.addl(regH, y3);

        // ---- one quarter of the schedule update ----
        switch (iter % 4) {
            case 0:
                masm.vpalignr(xtmp0, vector7, vector6, 4, ymm);
                masm.vpaddd(xtmp0, xtmp0, vector4, ymm);
                masm.vpalignr(xtmp1, vector5, vector4, 4, ymm);
                masm.vpsrld(xtmp2, xtmp1, 7, ymm);
                masm.vpslld(xtmp3, xtmp1, 25, ymm);
                masm.vpor(xtmp3, xtmp3, xtmp2, ymm);
                masm.vpsrld(xtmp2, xtmp1, 18, ymm);
                break;
            case 1:
                masm.vpsrld(xtmp4, xtmp1, 3, ymm);
                masm.vpslld(xtmp1, xtmp1, 14, ymm);
                masm.vpxor(xtmp3, xtmp3, xtmp1, ymm);
                masm.vpxor(xtmp3, xtmp3, xtmp2, ymm);
                masm.vpxor(xtmp1, xtmp3, xtmp4, ymm);
                masm.vpshufd(xtmp2, vector7, 250, ymm);
                masm.vpaddd(xtmp0, xtmp0, xtmp1, ymm);
                masm.vpsrld(xtmp4, xtmp2, 10, ymm);
                break;
            case 2:
                masm.vpsrlq(xtmp3, xtmp2, 19, ymm);
                masm.vpsrlq(xtmp2, xtmp2, 17, ymm);
                masm.vpxor(xtmp2, xtmp2, xtmp3, ymm);
                masm.vpxor(xtmp4, xtmp4, xtmp2, ymm);
                masm.vpshufb(xtmp4, xtmp4, maskShuf00BA, ymm);
                masm.vpaddd(xtmp0, xtmp0, xtmp4, ymm);
                masm.vpshufd(xtmp2, xtmp0, 80, ymm);
                break;
            case 3:
                masm.vpsrld(xtmp5, xtmp2, 10, ymm);
                masm.vpsrlq(xtmp3, xtmp2, 19, ymm);
                masm.vpsrlq(xtmp2, xtmp2, 17, ymm);
                masm.vpxor(xtmp2, xtmp2, xtmp3, ymm);
                masm.vpxor(xtmp5, xtmp5, xtmp2, ymm);
                masm.vpshufb(xtmp5, xtmp5, maskShufDC00, ymm);
                masm.vpaddd(vector4, xtmp5, xtmp0, ymm);
                break;
            default:
                // Only reachable for a negative iter; nothing is emitted in that case.
                break;
        }
    }

    /**
     * Emits "add memory": {@code r2 += [r1 + disp]} as a 32-bit addition, then stores the sum
     * back to {@code [r1 + disp]}, so register and memory end up with the same value.
     *
     * @param disp byte displacement from {@code r1}
     * @param r1 base address register
     * @param r2 register that is added to and then written back
     */
    private static void addm(AMD64MacroAssembler masm, int disp, Register r1, Register r2) {
        final AMD64Address slot = new AMD64Address(r1, disp);
        masm.addl(r2, slot);
        masm.movl(slot, r2);
    }
}

