[experiment] add alternative wasm sqlite3 implementation available via build-tag (#2863)

This allows GoToSocial to be built with [SQLite transpiled to WASM](https://github.com/ncruces/go-sqlite3), accessed through [Wazero](https://wazero.io/).
This commit is contained in:
kim
2024-05-27 15:46:15 +00:00
committed by GitHub
parent cce21c11cb
commit 1e7b32490d
398 changed files with 86174 additions and 684 deletions
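For orientation (not part of the diff itself): ncruces/go-sqlite3 provides a database/sql driver whose SQLite engine is a WASM build executed by Wazero at runtime, so no cgo is needed. The sketch below shows that library's usual registration pattern as I understand it, assuming fmt/database/sql imports; the database file name is arbitrary, and GoToSocial's actual build tag and database wiring are not shown in this excerpt.

package main

import (
"database/sql"
"fmt"

_ "github.com/ncruces/go-sqlite3/driver" // registers the "sqlite3" database/sql driver
_ "github.com/ncruces/go-sqlite3/embed" // embeds the SQLite WASM binary run under Wazero
)

func main() {
db, err := sql.Open("sqlite3", "file:demo.db")
if err != nil {
panic(err)
}
defer db.Close()

var version string
if err := db.QueryRow(`SELECT sqlite_version()`).Scan(&version); err != nil {
panic(err)
}
fmt.Println("sqlite version:", version)
}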


@@ -0,0 +1,170 @@
package backend
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type (
// FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature.
FunctionABI struct {
Initialized bool
Args, Rets []ABIArg
ArgStackSize, RetStackSize int64
ArgIntRealRegs byte
ArgFloatRealRegs byte
RetIntRealRegs byte
RetFloatRealRegs byte
}
// ABIArg represents the location of either an argument or a return value.
ABIArg struct {
// Index is the index of the argument.
Index int
// Kind is the kind of the argument.
Kind ABIArgKind
// Reg is valid if Kind == ABIArgKindReg.
// This VReg must be based on RealReg.
Reg regalloc.VReg
// Offset is valid if Kind == ABIArgKindStack.
// This is the offset from the beginning of either arg or ret stack slot.
Offset int64
// Type is the type of the argument.
Type ssa.Type
}
// ABIArgKind is the kind of ABI argument.
ABIArgKind byte
)
const (
// ABIArgKindReg represents an argument passed in a register.
ABIArgKindReg = iota
// ABIArgKindStack represents an argument passed on the stack.
ABIArgKindStack
)
// String implements fmt.Stringer.
func (a *ABIArg) String() string {
return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind)
}
// String implements fmt.Stringer.
func (a ABIArgKind) String() string {
switch a {
case ABIArgKindReg:
return "reg"
case ABIArgKindStack:
return "stack"
default:
panic("BUG")
}
}
// Init initializes the FunctionABI for the given signature.
func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) {
if len(a.Rets) < len(sig.Results) {
a.Rets = make([]ABIArg, len(sig.Results))
}
a.Rets = a.Rets[:len(sig.Results)]
a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats)
if argsNum := len(sig.Params); len(a.Args) < argsNum {
a.Args = make([]ABIArg, argsNum)
}
a.Args = a.Args[:len(sig.Params)]
a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats)
// Gather the real registers usages in arg/return.
a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0
a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0
for i := range a.Rets {
r := &a.Rets[i]
if r.Kind == ABIArgKindReg {
if r.Type.IsInt() {
a.RetIntRealRegs++
} else {
a.RetFloatRealRegs++
}
}
}
for i := range a.Args {
arg := &a.Args[i]
if arg.Kind == ABIArgKindReg {
if arg.Type.IsInt() {
a.ArgIntRealRegs++
} else {
a.ArgFloatRealRegs++
}
}
}
a.Initialized = true
}
// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types)
// where if len(s) > len(types), the last elements of s are for the multi-return slot.
func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) {
il, fl := len(ints), len(floats)
var stackOffset int64
intParamIndex, floatParamIndex := 0, 0
for i, typ := range types {
arg := &s[i]
arg.Index = i
arg.Type = typ
if typ.IsInt() {
if intParamIndex >= il {
arg.Kind = ABIArgKindStack
const slotSize = 8 // Align 8 bytes.
arg.Offset = stackOffset
stackOffset += slotSize
} else {
arg.Kind = ABIArgKindReg
arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt)
intParamIndex++
}
} else {
if floatParamIndex >= fl {
arg.Kind = ABIArgKindStack
slotSize := int64(8) // Align at least 8 bytes.
if typ.Bits() == 128 { // Vector.
slotSize = 16
}
arg.Offset = stackOffset
stackOffset += slotSize
} else {
arg.Kind = ABIArgKindReg
arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat)
floatParamIndex++
}
}
}
return stackOffset
}
func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 {
stackSlotSize := a.RetStackSize + a.ArgStackSize
// Align stackSlotSize to 16 bytes.
stackSlotSize = (stackSlotSize + 15) &^ 15
// Check overflow 32-bit.
if stackSlotSize > 0xFFFFFFFF {
panic("ABI stack slot size overflow")
}
return uint32(stackSlotSize)
}
func (a *FunctionABI) ABIInfoAsUint64() uint64 {
return uint64(a.ArgIntRealRegs)<<56 |
uint64(a.ArgFloatRealRegs)<<48 |
uint64(a.RetIntRealRegs)<<40 |
uint64(a.RetFloatRealRegs)<<32 |
uint64(a.AlignedArgResultStackSlotSize())
}
func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) {
return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info)
}
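As a quick illustration of the packing above (a sketch written as if inside this package, not part of the vendored file): the four register counts live in the top four bytes and the aligned stack slot size in the low 32 bits, so ABIInfoFromUint64 recovers exactly what ABIInfoAsUint64 stored.

func demoABIInfoRoundTrip() {
abi := FunctionABI{
ArgIntRealRegs: 2, ArgFloatRealRegs: 1,
RetIntRealRegs: 1, RetFloatRealRegs: 0,
// 10+10 = 20 bytes of stack, aligned up to 32 by AlignedArgResultStackSlotSize.
ArgStackSize: 10, RetStackSize: 10,
}
info := abi.ABIInfoAsUint64()
argInt, argFloat, retInt, retFloat, slot := ABIInfoFromUint64(info)
fmt.Println(argInt, argFloat, retInt, retFloat, slot) // prints: 2 1 1 0 32
}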


@@ -0,0 +1,3 @@
// Package backend must be free of Wasm-specific concept. In other words,
// this package must not import internal/wasm package.
package backend


@@ -0,0 +1,417 @@
package backend
import (
"context"
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
// NewCompiler returns a new Compiler that can generate machine code.
func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler {
return newCompiler(ctx, mach, builder)
}
func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler {
argResultInts, argResultFloats := mach.ArgsResultsRegs()
c := &compiler{
mach: mach, ssaBuilder: builder,
nextVRegID: regalloc.VRegIDNonReservedBegin,
argResultInts: argResultInts,
argResultFloats: argResultFloats,
}
mach.SetCompiler(c)
return c
}
// Compiler is the backend of wazevo which takes ssa.Builder and Machine,
// and uses the information there to emit the final machine code.
type Compiler interface {
// SSABuilder returns the ssa.Builder used by this compiler.
SSABuilder() ssa.Builder
// Compile executes the following steps:
// 1. Lower()
// 2. RegAlloc()
// 3. Finalize()
// 4. Encode()
//
// Each step can be called individually for testing purposes; therefore they are exposed in this interface too.
//
// The returned byte slices are the machine code and the relocation information for the machine code.
// The caller is responsible for copying them immediately since the compiler may reuse the buffer.
Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error)
// Lower lowers the given ssa.Instruction to the machine-specific instructions.
Lower()
// RegAlloc performs the register allocation after Lower is called.
RegAlloc()
// Finalize performs the finalization of the compilation, including machine code emission.
// This must be called after RegAlloc.
Finalize(ctx context.Context) error
// Buf returns the buffer of the encoded machine code. This is only used for testing purposes.
Buf() []byte
BufPtr() *[]byte
// Format returns the debug string of the current state of the compiler.
Format() string
// Init initializes the internal state of the compiler for the next compilation.
Init()
// AllocateVReg allocates a new virtual register of the given type.
AllocateVReg(typ ssa.Type) regalloc.VReg
// ValueDefinition returns the definition of the given value.
ValueDefinition(ssa.Value) *SSAValueDefinition
// VRegOf returns the virtual register of the given ssa.Value.
VRegOf(value ssa.Value) regalloc.VReg
// TypeOf returns the ssa.Type of the given virtual register.
TypeOf(regalloc.VReg) ssa.Type
// MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID,
// and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group.
MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool
// MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode,
// this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid.
//
// Note: caller should be careful to avoid excessive allocation on opcodes slice.
MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode
// AddRelocationInfo appends the relocation information for the function reference at the current buffer offset.
AddRelocationInfo(funcRef ssa.FuncRef)
// AddSourceOffsetInfo appends the source offset information for the given offset.
AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset)
// SourceOffsetInfo returns the source offset information for the current buffer offset.
SourceOffsetInfo() []SourceOffsetInfo
// EmitByte appends a byte to the buffer. Used during the code emission.
EmitByte(b byte)
// Emit4Bytes appends 4 bytes to the buffer. Used during the code emission.
Emit4Bytes(b uint32)
// Emit8Bytes appends 8 bytes to the buffer. Used during the code emission.
Emit8Bytes(b uint64)
// GetFunctionABI returns the ABI information for the given signature.
GetFunctionABI(sig *ssa.Signature) *FunctionABI
}
// RelocationInfo represents the relocation information for a call instruction.
type RelocationInfo struct {
// Offset represents the offset from the beginning of the machine code of either a function or the entire module.
Offset int64
// Target is the target function of the call instruction.
FuncRef ssa.FuncRef
}
// compiler implements Compiler.
type compiler struct {
mach Machine
currentGID ssa.InstructionGroupID
ssaBuilder ssa.Builder
// nextVRegID is the next virtual register ID to be allocated.
nextVRegID regalloc.VRegID
// ssaValueToVRegs maps ssa.ValueID to regalloc.VReg.
ssaValueToVRegs [] /* ValueID to */ regalloc.VReg
// ssaValueDefinitions maps ssa.ValueID to its definition.
ssaValueDefinitions []SSAValueDefinition
// ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts().
ssaValueRefCounts []int
// returnVRegs is the list of virtual registers that store the return values.
returnVRegs []regalloc.VReg
varEdges [][2]regalloc.VReg
varEdgeTypes []ssa.Type
constEdges []struct {
cInst *ssa.Instruction
dst regalloc.VReg
}
vRegSet []bool
vRegIDs []regalloc.VRegID
tempRegs []regalloc.VReg
tmpVals []ssa.Value
ssaTypeOfVRegID [] /* VRegID to */ ssa.Type
buf []byte
relocations []RelocationInfo
sourceOffsets []SourceOffsetInfo
// abis maps ssa.SignatureID to the ABI implementation.
abis []FunctionABI
argResultInts, argResultFloats []regalloc.RealReg
}
// SourceOffsetInfo associates a source offset with an executable offset.
type SourceOffsetInfo struct {
// SourceOffset is the source offset in the original source code.
SourceOffset ssa.SourceOffset
// ExecutableOffset is the offset in the compiled executable.
ExecutableOffset int64
}
// Compile implements Compiler.Compile.
func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) {
c.Lower()
if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) {
fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
}
if wazevoapi.DeterministicCompilationVerifierEnabled {
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format())
}
c.RegAlloc()
if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) {
fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
}
if wazevoapi.DeterministicCompilationVerifierEnabled {
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format())
}
if err := c.Finalize(ctx); err != nil {
return nil, nil, err
}
if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) {
fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
}
if wazevoapi.DeterministicCompilationVerifierEnabled {
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format())
}
return c.buf, c.relocations, nil
}
// RegAlloc implements Compiler.RegAlloc.
func (c *compiler) RegAlloc() {
c.mach.RegAlloc()
}
// Finalize implements Compiler.Finalize.
func (c *compiler) Finalize(ctx context.Context) error {
c.mach.PostRegAlloc()
return c.mach.Encode(ctx)
}
// setCurrentGroupID sets the current instruction group ID.
func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) {
c.currentGID = gid
}
// assignVirtualRegisters assigns a virtual register to each valid ssa.ValueID in the ssa.Builder.
func (c *compiler) assignVirtualRegisters() {
builder := c.ssaBuilder
refCounts := builder.ValueRefCounts()
c.ssaValueRefCounts = refCounts
need := len(refCounts)
if need >= len(c.ssaValueToVRegs) {
c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...)
}
if need >= len(c.ssaValueDefinitions) {
c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...)
}
for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
// First we assign a virtual register to each parameter.
for i := 0; i < blk.Params(); i++ {
p := blk.Param(i)
pid := p.ID()
typ := p.Type()
vreg := c.AllocateVReg(typ)
c.ssaValueToVRegs[pid] = vreg
c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg}
c.ssaTypeOfVRegID[vreg.ID()] = p.Type()
}
// Then assign a virtual register to each value produced by the instructions.
for cur := blk.Root(); cur != nil; cur = cur.Next() {
r, rs := cur.Returns()
var N int
if r.Valid() {
id := r.ID()
ssaTyp := r.Type()
typ := r.Type()
vReg := c.AllocateVReg(typ)
c.ssaValueToVRegs[id] = vReg
c.ssaValueDefinitions[id] = SSAValueDefinition{
Instr: cur,
N: 0,
RefCount: refCounts[id],
}
c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
N++
}
for _, r := range rs {
id := r.ID()
ssaTyp := r.Type()
vReg := c.AllocateVReg(ssaTyp)
c.ssaValueToVRegs[id] = vReg
c.ssaValueDefinitions[id] = SSAValueDefinition{
Instr: cur,
N: N,
RefCount: refCounts[id],
}
c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
N++
}
}
}
for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ {
typ := retBlk.Param(i).Type()
vReg := c.AllocateVReg(typ)
c.returnVRegs = append(c.returnVRegs, vReg)
c.ssaTypeOfVRegID[vReg.ID()] = typ
}
}
// AllocateVReg implements Compiler.AllocateVReg.
func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg {
regType := regalloc.RegTypeOf(typ)
r := regalloc.VReg(c.nextVRegID).SetRegType(regType)
id := r.ID()
if int(id) >= len(c.ssaTypeOfVRegID) {
c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...)
}
c.ssaTypeOfVRegID[id] = typ
c.nextVRegID++
return r
}
// Init implements Compiler.Init.
func (c *compiler) Init() {
c.currentGID = 0
c.nextVRegID = regalloc.VRegIDNonReservedBegin
c.returnVRegs = c.returnVRegs[:0]
c.mach.Reset()
c.varEdges = c.varEdges[:0]
c.constEdges = c.constEdges[:0]
c.buf = c.buf[:0]
c.sourceOffsets = c.sourceOffsets[:0]
c.relocations = c.relocations[:0]
}
// ValueDefinition implements Compiler.ValueDefinition.
func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition {
return &c.ssaValueDefinitions[value.ID()]
}
// VRegOf implements Compiler.VRegOf.
func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg {
return c.ssaValueToVRegs[value.ID()]
}
// Format implements Compiler.Format.
func (c *compiler) Format() string {
return c.mach.Format()
}
// TypeOf implements Compiler.TypeOf.
func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type {
return c.ssaTypeOfVRegID[v.ID()]
}
// MatchInstr implements Compiler.MatchInstr.
func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool {
instr := def.Instr
return def.IsFromInstr() &&
instr.Opcode() == opcode &&
instr.GroupID() == c.currentGID &&
def.RefCount < 2
}
// MatchInstrOneOf implements Compiler.MatchInstrOneOf.
func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode {
instr := def.Instr
if !def.IsFromInstr() {
return ssa.OpcodeInvalid
}
if instr.GroupID() != c.currentGID {
return ssa.OpcodeInvalid
}
if def.RefCount >= 2 {
return ssa.OpcodeInvalid
}
opcode := instr.Opcode()
for _, op := range opcodes {
if opcode == op {
return opcode
}
}
return ssa.OpcodeInvalid
}
// SSABuilder implements Compiler.SSABuilder.
func (c *compiler) SSABuilder() ssa.Builder {
return c.ssaBuilder
}
// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo.
func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) {
c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{
SourceOffset: sourceOffset,
ExecutableOffset: executableOffset,
})
}
// SourceOffsetInfo implements Compiler.SourceOffsetInfo.
func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo {
return c.sourceOffsets
}
// AddRelocationInfo implements Compiler.AddRelocationInfo.
func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) {
c.relocations = append(c.relocations, RelocationInfo{
Offset: int64(len(c.buf)),
FuncRef: funcRef,
})
}
// Emit8Bytes implements Compiler.Emit8Bytes.
func (c *compiler) Emit8Bytes(b uint64) {
c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56))
}
// Emit4Bytes implements Compiler.Emit4Bytes.
func (c *compiler) Emit4Bytes(b uint32) {
c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24))
}
// EmitByte implements Compiler.EmitByte.
func (c *compiler) EmitByte(b byte) {
c.buf = append(c.buf, b)
}
// Buf implements Compiler.Buf.
func (c *compiler) Buf() []byte {
return c.buf
}
// BufPtr implements Compiler.BufPtr.
func (c *compiler) BufPtr() *[]byte {
return &c.buf
}
func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI {
if int(sig.ID) >= len(c.abis) {
c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...)
}
abi := &c.abis[sig.ID]
if abi.Initialized {
return abi
}
abi.Init(sig, c.argResultInts, c.argResultFloats)
return abi
}
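To tie this file together, here is a minimal usage sketch of the Compiler interface above (not part of the vendored file): construct the compiler with NewCompiler, call Compile (which runs Lower, RegAlloc, Finalize and Encode), and copy the results out before the buffers are reused. The mach and builder parameters stand in for a concrete Machine implementation and an ssa.Builder prepared by the frontend, which are outside this excerpt.

func compileOne(ctx context.Context, mach Machine, builder ssa.Builder) ([]byte, []RelocationInfo, error) {
c := NewCompiler(ctx, mach, builder)
body, relocs, err := c.Compile(ctx)
if err != nil {
return nil, nil, err
}
// The compiler may reuse its internal buffers for the next function,
// so copy the machine code and relocations out immediately.
executable := make([]byte, len(body))
copy(executable, body)
relocsCopy := append([]RelocationInfo(nil), relocs...)
return executable, relocsCopy, nil
}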


@@ -0,0 +1,226 @@
package backend
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// Lower implements Compiler.Lower.
func (c *compiler) Lower() {
c.assignVirtualRegisters()
c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature()))
c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax())
c.lowerBlocks()
}
// lowerBlocks lowers each block in the ssa.Builder.
func (c *compiler) lowerBlocks() {
builder := c.ssaBuilder
for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
c.lowerBlock(blk)
}
ectx := c.mach.ExecutableContext()
// After lowering all blocks, we need to link adjacent blocks to lay out one single instruction list.
var prev ssa.BasicBlock
for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() {
if prev != nil {
ectx.LinkAdjacentBlocks(prev, next)
}
prev = next
}
}
func (c *compiler) lowerBlock(blk ssa.BasicBlock) {
mach := c.mach
ectx := mach.ExecutableContext()
ectx.StartBlock(blk)
// We traverse the instructions in reverse order because we might want to lower multiple
// instructions together.
cur := blk.Tail()
// First gather the branching instructions at the end of the block.
var br0, br1 *ssa.Instruction
if cur.IsBranching() {
br0 = cur
cur = cur.Prev()
if cur != nil && cur.IsBranching() {
br1 = cur
cur = cur.Prev()
}
}
if br0 != nil {
c.lowerBranches(br0, br1)
}
if br1 != nil && br0 == nil {
panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?")
}
// Now start lowering the non-branching instructions.
for ; cur != nil; cur = cur.Prev() {
c.setCurrentGroupID(cur.GroupID())
if cur.Lowered() {
continue
}
switch cur.Opcode() {
case ssa.OpcodeReturn:
rets := cur.ReturnVals()
if len(rets) > 0 {
c.mach.LowerReturns(rets)
}
c.mach.InsertReturn()
default:
mach.LowerInstr(cur)
}
ectx.FlushPendingInstructions()
}
// Finally, if this is the entry block, we have to insert copies of arguments from their real locations to the VRegs.
if blk.EntryBlock() {
c.lowerFunctionArguments(blk)
}
ectx.EndBlock()
}
// lowerBranches is called right after StartBlock and before any LowerInstr call if
// there are branches to the given block. br0 is the very end of the block and br1 is the one before br0 if it exists.
// At least br0 is not nil, but br1 can be nil if there's no branching before br0.
//
// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock.
func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) {
ectx := c.mach.ExecutableContext()
c.setCurrentGroupID(br0.GroupID())
c.mach.LowerSingleBranch(br0)
ectx.FlushPendingInstructions()
if br1 != nil {
c.setCurrentGroupID(br1.GroupID())
c.mach.LowerConditionalBranch(br1)
ectx.FlushPendingInstructions()
}
if br0.Opcode() == ssa.OpcodeJump {
_, args, target := br0.BranchData()
argExists := len(args) != 0
if argExists && br1 != nil {
panic("BUG: critical edge split failed")
}
if argExists && target.ReturnBlock() {
if len(args) > 0 {
c.mach.LowerReturns(args)
}
} else if argExists {
c.lowerBlockArguments(args, target)
}
}
ectx.FlushPendingInstructions()
}
func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) {
ectx := c.mach.ExecutableContext()
c.tmpVals = c.tmpVals[:0]
for i := 0; i < entry.Params(); i++ {
p := entry.Param(i)
if c.ssaValueRefCounts[p.ID()] > 0 {
c.tmpVals = append(c.tmpVals, p)
} else {
// If the argument is not used, we can just pass an invalid value.
c.tmpVals = append(c.tmpVals, ssa.ValueInvalid)
}
}
c.mach.LowerParams(c.tmpVals)
ectx.FlushPendingInstructions()
}
// lowerBlockArguments lowers the passing of arguments to the given successor block.
func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) {
if len(args) != succ.Params() {
panic("BUG: mismatched number of arguments")
}
c.varEdges = c.varEdges[:0]
c.varEdgeTypes = c.varEdgeTypes[:0]
c.constEdges = c.constEdges[:0]
for i := 0; i < len(args); i++ {
dst := succ.Param(i)
src := args[i]
dstReg := c.VRegOf(dst)
srcDef := c.ssaValueDefinitions[src.ID()]
if srcDef.IsFromInstr() && srcDef.Instr.Constant() {
c.constEdges = append(c.constEdges, struct {
cInst *ssa.Instruction
dst regalloc.VReg
}{cInst: srcDef.Instr, dst: dstReg})
} else {
srcReg := c.VRegOf(src)
// Even when src == dst, insert the move so that such registers are kept alive.
c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg})
c.varEdgeTypes = append(c.varEdgeTypes, src.Type())
}
}
// Check if there's an overlap among the dsts and srcs in varEdges.
c.vRegIDs = c.vRegIDs[:0]
for _, edge := range c.varEdges {
src := edge[0].ID()
if int(src) >= len(c.vRegSet) {
c.vRegSet = append(c.vRegSet, make([]bool, src+1)...)
}
c.vRegSet[src] = true
c.vRegIDs = append(c.vRegIDs, src)
}
separated := true
for _, edge := range c.varEdges {
dst := edge[1].ID()
if int(dst) >= len(c.vRegSet) {
c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...)
} else {
if c.vRegSet[dst] {
separated = false
break
}
}
}
for _, id := range c.vRegIDs {
c.vRegSet[id] = false // reset for the next use.
}
if separated {
// If there's no overlap, we can simply move the source to destination.
for i, edge := range c.varEdges {
src, dst := edge[0], edge[1]
c.mach.InsertMove(dst, src, c.varEdgeTypes[i])
}
} else {
// Otherwise, we go through temporary registers in two phases.
//
// First move all of the sources to temporary registers.
c.tempRegs = c.tempRegs[:0]
for i, edge := range c.varEdges {
src := edge[0]
typ := c.varEdgeTypes[i]
temp := c.AllocateVReg(typ)
c.tempRegs = append(c.tempRegs, temp)
c.mach.InsertMove(temp, src, typ)
}
// Then move the temporary registers to the destination.
for i, edge := range c.varEdges {
temp := c.tempRegs[i]
dst := edge[1]
c.mach.InsertMove(dst, temp, c.varEdgeTypes[i])
}
}
// Finally, move the constants.
for _, edge := range c.constEdges {
cInst, dst := edge.cInst, edge.dst
c.mach.InsertLoadConstantBlockArg(cInst, dst)
}
}
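The overlap check above exists because block-argument moves form a parallel assignment: if a destination register of one edge is also the source of another, executing the moves one at a time would clobber a value that is still needed. A minimal sketch of the problem on plain values (not the real VReg machinery):

// Edges: {v1 -> v2, v2 -> v1}. Moving directly in edge order would do
// v2 = v1; v1 = v2 // v1's old value overwrites v2 before v2 is read.
// Mirroring the two-phase scheme used above when `separated` is false:
func lowerSwapEdges(v1, v2 uint64) (newV1, newV2 uint64) {
t1, t2 := v1, v2 // phase 1: copy every source into a fresh temporary
v2, v1 = t1, t2 // phase 2: write the temporaries into the destinations
return v1, v2
}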


@@ -0,0 +1,219 @@
package backend
import (
"fmt"
"math"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type ExecutableContext interface {
// StartLoweringFunction is called when the lowering of the given function is started.
// maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function.
StartLoweringFunction(maximumBlockID ssa.BasicBlockID)
// LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list.
LinkAdjacentBlocks(prev, next ssa.BasicBlock)
// StartBlock is called when the compilation of the given block is started.
// The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with
// ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd.
StartBlock(ssa.BasicBlock)
// EndBlock is called when the compilation of the current block is finished.
EndBlock()
// FlushPendingInstructions flushes the pending instructions to the buffer.
// This will be called after the lowering of each SSA Instruction.
FlushPendingInstructions()
}
type ExecutableContextT[Instr any] struct {
CurrentSSABlk ssa.BasicBlock
// InstructionPool is the pool of instructions.
InstructionPool wazevoapi.Pool[Instr]
asNop func(*Instr)
setNext func(*Instr, *Instr)
setPrev func(*Instr, *Instr)
// RootInstr is the root instruction of the executable.
RootInstr *Instr
labelPositionPool wazevoapi.Pool[LabelPosition[Instr]]
NextLabel Label
// LabelPositions maps a label to the instructions of the region which the label represents.
LabelPositions map[Label]*LabelPosition[Instr]
OrderedBlockLabels []*LabelPosition[Instr]
// PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
PerBlockHead, PerBlockEnd *Instr
// PendingInstructions are the instructions which are not yet emitted into the instruction list.
PendingInstructions []*Instr
// SsaBlockIDToLabels maps an SSA block ID to the label.
SsaBlockIDToLabels []Label
}
func NewExecutableContextT[Instr any](
resetInstruction func(*Instr),
setNext func(*Instr, *Instr),
setPrev func(*Instr, *Instr),
asNop func(*Instr),
) *ExecutableContextT[Instr] {
return &ExecutableContextT[Instr]{
InstructionPool: wazevoapi.NewPool[Instr](resetInstruction),
asNop: asNop,
setNext: setNext,
setPrev: setPrev,
labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]),
LabelPositions: make(map[Label]*LabelPosition[Instr]),
NextLabel: LabelInvalid,
}
}
func resetLabelPosition[T any](l *LabelPosition[T]) {
*l = LabelPosition[T]{}
}
// StartLoweringFunction implements ExecutableContext.
func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) {
imax := int(max)
if len(e.SsaBlockIDToLabels) <= imax {
// Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration.
e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...)
}
}
func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) {
e.CurrentSSABlk = blk
l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
if l == LabelInvalid {
l = e.AllocateLabel()
e.SsaBlockIDToLabels[blk.ID()] = l
}
end := e.allocateNop0()
e.PerBlockHead, e.PerBlockEnd = end, end
labelPos, ok := e.LabelPositions[l]
if !ok {
labelPos = e.AllocateLabelPosition(l)
e.LabelPositions[l] = labelPos
}
e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos)
labelPos.Begin, labelPos.End = end, end
labelPos.SB = blk
}
// EndBlock implements ExecutableContext.
func (e *ExecutableContextT[T]) EndBlock() {
// Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions.
e.insertAtPerBlockHead(e.allocateNop0())
l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
e.LabelPositions[l].Begin = e.PerBlockHead
if e.CurrentSSABlk.EntryBlock() {
e.RootInstr = e.PerBlockHead
}
}
func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) {
if e.PerBlockHead == nil {
e.PerBlockHead = i
e.PerBlockEnd = i
return
}
e.setNext(i, e.PerBlockHead)
e.setPrev(e.PerBlockHead, i)
e.PerBlockHead = i
}
// FlushPendingInstructions implements ExecutableContext.
func (e *ExecutableContextT[T]) FlushPendingInstructions() {
l := len(e.PendingInstructions)
if l == 0 {
return
}
for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order.
e.insertAtPerBlockHead(e.PendingInstructions[i])
}
e.PendingInstructions = e.PendingInstructions[:0]
}
func (e *ExecutableContextT[T]) Reset() {
e.labelPositionPool.Reset()
e.InstructionPool.Reset()
for l := Label(0); l <= e.NextLabel; l++ {
delete(e.LabelPositions, l)
}
e.PendingInstructions = e.PendingInstructions[:0]
e.OrderedBlockLabels = e.OrderedBlockLabels[:0]
e.RootInstr = nil
e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0]
e.PerBlockHead, e.PerBlockEnd = nil, nil
e.NextLabel = LabelInvalid
}
// AllocateLabel allocates an unused label.
func (e *ExecutableContextT[T]) AllocateLabel() Label {
e.NextLabel++
return e.NextLabel
}
func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] {
l := e.labelPositionPool.Allocate()
l.L = la
return l
}
func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label {
if blk.ReturnBlock() {
return LabelReturn
}
l := e.SsaBlockIDToLabels[blk.ID()]
if l == LabelInvalid {
l = e.AllocateLabel()
e.SsaBlockIDToLabels[blk.ID()] = l
}
return l
}
func (e *ExecutableContextT[T]) allocateNop0() *T {
i := e.InstructionPool.Allocate()
e.asNop(i)
return i
}
// LinkAdjacentBlocks implements ExecutableContext.
func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) {
prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)]
nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)]
e.setNext(prevLabelPos.End, nextLabelPos.Begin)
}
// LabelPosition represents the regions of the generated code which the label represents.
type LabelPosition[Instr any] struct {
SB ssa.BasicBlock
L Label
Begin, End *Instr
BinaryOffset int64
}
// Label represents a position in the generated code which is either
// a real instruction or the constant pool (e.g. jump tables).
//
// This is exactly the same as the traditional "label" in assembly code.
type Label uint32
const (
LabelInvalid Label = 0
LabelReturn Label = math.MaxUint32
)
// String implements fmt.Stringer.
func (l Label) String() string {
return fmt.Sprintf("L%d", l)
}


@ -0,0 +1,33 @@
package backend
import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call.
// argBegin is the index of the first argument in the signature which is neither the execution context nor the module context.
func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) {
var paramNeededInBytes, resultNeededInBytes int64
for _, p := range sig.Params[argBegin:] {
s := int64(p.Size())
if s < 8 {
s = 8 // We use uint64 for all basic types, except SIMD v128.
}
paramNeededInBytes += s
}
for _, r := range sig.Results {
s := int64(r.Size())
if s < 8 {
s = 8 // We use uint64 for all basic types, except SIMD v128.
}
resultNeededInBytes += s
}
if paramNeededInBytes > resultNeededInBytes {
ret = paramNeededInBytes
} else {
ret = resultNeededInBytes
}
retUnaligned = ret
// Align to 16 bytes.
ret = (ret + 15) &^ 15
return
}
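A worked example of the sizing above (an illustration written as if inside this package and assuming an fmt import, not part of the file): suppose the signature's params after argBegin are (i32, f64) and the results are (v128, i64).

func demoRequiredStackSize(sig *ssa.Signature) {
// paramNeededInBytes = 8 + 8 = 16 (i32 is widened to a uint64 slot)
// resultNeededInBytes = 16 + 8 = 24 (v128 keeps its 16 bytes)
// ret = max(16, 24) = 24, retUnaligned = 24, then ret is aligned up to 32.
ret, retUnaligned := GoFunctionCallRequiredStackSize(sig, 2)
fmt.Println(ret, retUnaligned) // 32 24 for the signature described above
}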


@@ -0,0 +1,186 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// For the details of the ABI, see:
// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture
var (
intArgResultRegs = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11}
floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7}
)
var regInfo = &regalloc.RegisterInfo{
AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
regalloc.RegTypeInt: {
rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15,
},
regalloc.RegTypeFloat: {
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
},
},
CalleeSavedRegisters: regalloc.NewRegSet(
rdx, r12, r13, r14, r15,
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
),
CallerSavedRegisters: regalloc.NewRegSet(
rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11,
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
),
RealRegToVReg: []regalloc.VReg{
rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg,
r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg,
xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg,
xmm7: xmm7VReg, xmm8: xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg,
xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg,
},
RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
RealRegType: func(r regalloc.RealReg) regalloc.RegType {
if r < xmm0 {
return regalloc.RegTypeInt
}
return regalloc.RegTypeFloat
},
}
// ArgsResultsRegs implements backend.Machine.
func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
return intArgResultRegs, floatArgResultRegs
}
// LowerParams implements backend.Machine.
func (m *machine) LowerParams(args []ssa.Value) {
a := m.currentABI
for i, ssaArg := range args {
if !ssaArg.Valid() {
continue
}
reg := m.c.VRegOf(ssaArg)
arg := &a.Args[i]
if arg.Kind == backend.ABIArgKindReg {
m.InsertMove(reg, arg.Reg, arg.Type)
} else {
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <-- RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ........... |
// | spill slot 0 |
// RSP--> +-----------------+
// (low address)
// Load the value from the arg stack slot above the current RBP.
load := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16)))
switch arg.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, mem, reg)
case ssa.TypeI64:
load.asMov64MR(mem, reg)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg)
default:
panic("BUG")
}
m.insert(load)
}
}
}
// LowerReturns implements backend.Machine.
func (m *machine) LowerReturns(rets []ssa.Value) {
// Load the XMM registers first, as inlining a constant return value
// might need a temporary register.
a := m.currentABI
for i, ret := range rets {
r := &a.Rets[i]
if !r.Type.IsInt() {
m.LowerReturn(ret, r)
}
}
// Then load the GPR registers.
for i, ret := range rets {
r := &a.Rets[i]
if r.Type.IsInt() {
m.LowerReturn(ret, r)
}
}
}
func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) {
reg := m.c.VRegOf(ret)
if def := m.c.ValueDefinition(ret); def.IsFromInstr() {
// Constant instructions are inlined.
if inst := def.Instr; inst.Constant() {
m.insertLoadConstant(inst, reg)
}
}
if r.Kind == backend.ABIArgKindReg {
m.InsertMove(r.Reg, reg, ret.Type())
} else {
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <-- RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ........... |
// | spill slot 0 |
// RSP--> +-----------------+
// (low address)
// Store the value to the return stack slot above the current RBP.
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset)))
switch r.Type {
case ssa.TypeI32:
store.asMovRM(reg, mem, 4)
case ssa.TypeI64:
store.asMovRM(reg, mem, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, reg, mem)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, reg, mem)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, reg, mem)
}
m.insert(store)
}
}


@@ -0,0 +1,9 @@
package amd64
// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below.
// This implements wazevo.entrypoint, and see the comments there for detail.
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)


@@ -0,0 +1,29 @@
#include "funcdata.h"
#include "textflag.h"
// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
MOVQ preambleExecutable+0(FP), R11
MOVQ functionExecutable+8(FP), R14
MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX.
MOVQ moduleContextPtr+24(FP), BX // Second argument is passed in BX.
MOVQ paramResultSlicePtr+32(FP), R12
MOVQ goAllocatedStackSlicePtr+40(FP), R13
JMP R11
// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
MOVQ executable+0(FP), CX
MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX.
// Save the stack pointer and frame pointer.
MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer
MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer
// Then set the stack pointer and frame pointer to the values we got from the Go runtime.
MOVQ framePointer+24(FP), BP
// WARNING: do not update SP before BP, because the Go assembler translates (FP) as (SP) + 8.
MOVQ stackPointer+16(FP), SP
JMP CX


@@ -0,0 +1,248 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
var (
executionContextPtrReg = raxVReg
// The following are callee-saved registers. They can be used freely in the entry preamble
// since the preamble is called via a Go assembly function which has a stack-based ABI.
// savedExecutionContextPtr must also be a callee-saved reg so that it can be used in the prologue and epilogue.
savedExecutionContextPtr = rdxVReg
// paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s.
paramResultSlicePtr = r12VReg
// goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s.
goAllocatedStackPtr = r13VReg
// functionExecutable must match with entrypoint function in abi_entry_amd64.s.
functionExecutable = r14VReg
tmpIntReg = r15VReg
tmpXmmReg = xmm15VReg
)
// CompileEntryPreamble implements backend.Machine.
func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte {
root := m.compileEntryPreamble(sig)
m.encodeWithoutSSA(root)
buf := m.c.Buf()
return buf
}
func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction {
abi := backend.FunctionABI{}
abi.Init(sig, intArgResultRegs, floatArgResultRegs)
root := m.allocateNop()
//// ----------------------------------- prologue ----------------------------------- ////
// First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
// mov %executionContextPtrReg, %savedExecutionContextPtr
cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root)
// Next is to save the original RBP and RSP into the execution context.
cur = m.saveOriginalRSPRBP(cur)
// Now set the RSP to the Go-allocated stack pointer.
// mov %goAllocatedStackPtr, %rsp
cur = m.move64(goAllocatedStackPtr, rspVReg, cur)
if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 {
// Allocate stack slots for the arguments and return values.
// sub $stackSlotSize, %rsp
spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true)
cur = linkInstr(cur, spDec)
}
var offset uint32
for i := range abi.Args {
if i < 2 {
// execution context ptr and module context ptr are passed in rax and rbx by the Go assembly function.
continue
}
arg := &abi.Args[i]
cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg)
if arg.Type == ssa.TypeV128 {
offset += 16
} else {
offset += 8
}
}
// Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack.
zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true)
cur = linkInstr(cur, zerosRbp)
// Now ready to call the real function. Note that at this point the stack pointer is already set to the Go-allocated stack,
// which is aligned to 16 bytes.
call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi)
cur = linkInstr(cur, call)
//// ----------------------------------- epilogue ----------------------------------- ////
// Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr.
offset = 0
for i := range abi.Rets {
r := &abi.Rets[i]
cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize))
if r.Type == ssa.TypeV128 {
offset += 16
} else {
offset += 8
}
}
// Finally, restore the original RBP and RSP.
cur = m.restoreOriginalRSPRBP(cur)
ret := m.allocateInstr().asRet()
linkInstr(cur, ret)
return root
}
// saveOriginalRSPRBP saves the original RSP and RBP into the execution context.
func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction {
// mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg)
// mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg)
cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur)
cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur)
return cur
}
// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context.
func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction {
// mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg), %rbp
// mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg), %rsp
cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur)
cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur)
return cur
}
func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction {
mov := m.allocateInstr().asMovRR(src, dst, true)
return linkInstr(prev, mov)
}
func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction {
mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx))
instr := m.allocateInstr()
if store {
instr.asMovRM(r, mem, 8)
} else {
instr.asMov64MR(mem, r)
}
return linkInstr(prev, instr)
}
// This is for debugging.
func (m *machine) linkUD2(cur *instruction) *instruction { //nolint
return linkInstr(cur, m.allocateInstr().asUD2())
}
func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction {
var dst regalloc.VReg
argTyp := arg.Type
if arg.Kind == backend.ABIArgKindStack {
// The argument is passed on the stack, so load it into a temporary register first.
switch argTyp {
case ssa.TypeI32, ssa.TypeI64:
dst = tmpIntReg
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
dst = tmpXmmReg
default:
panic("BUG")
}
} else {
dst = arg.Reg
}
load := m.allocateInstr()
a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr))
switch arg.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, a, dst)
case ssa.TypeI64:
load.asMov64MR(a, dst)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, a, dst)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst)
}
cur = linkInstr(cur, load)
if arg.Kind == backend.ABIArgKindStack {
// Store back to the stack.
store := m.allocateInstr()
a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg))
switch arg.Type {
case ssa.TypeI32:
store.asMovRM(dst, a, 4)
case ssa.TypeI64:
store.asMovRM(dst, a, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, dst, a)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, dst, a)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, dst, a)
}
cur = linkInstr(cur, store)
}
return cur
}
func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction {
var r regalloc.VReg
if result.Kind == backend.ABIArgKindStack {
// Load the value to the temporary.
load := m.allocateInstr()
offset := resultStackSlotBeginOffset + uint32(result.Offset)
a := newOperandMem(m.newAmodeImmReg(offset, rspVReg))
switch result.Type {
case ssa.TypeI32:
r = tmpIntReg
load.asMovzxRmR(extModeLQ, a, r)
case ssa.TypeI64:
r = tmpIntReg
load.asMov64MR(a, r)
case ssa.TypeF32:
r = tmpXmmReg
load.asXmmUnaryRmR(sseOpcodeMovss, a, r)
case ssa.TypeF64:
r = tmpXmmReg
load.asXmmUnaryRmR(sseOpcodeMovsd, a, r)
case ssa.TypeV128:
r = tmpXmmReg
load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
} else {
r = result.Reg
}
store := m.allocateInstr()
a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr))
switch result.Type {
case ssa.TypeI32:
store.asMovRM(r, a, 4)
case ssa.TypeI64:
store.asMovRM(r, a, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, r, a)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, r, a)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, r, a)
}
return linkInstr(cur, store)
}


@@ -0,0 +1,443 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
var calleeSavedVRegs = []regalloc.VReg{
rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
}
// CompileGoFunctionTrampoline implements backend.Machine.
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
ectx := m.ectx
argBegin := 1 // Skips exec context by default.
if needModuleContextPtr {
argBegin++
}
abi := &backend.FunctionABI{}
abi.Init(sig, intArgResultRegs, floatArgResultRegs)
m.currentABI = abi
cur := m.allocateNop()
ectx.RootInstr = cur
// Execution context is always the first argument.
execCtrPtr := raxVReg
// First we update RBP and RSP just like the normal prologue.
//
// (high address) (high address)
// RBP ----> +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | ====> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | Return Addr | | Return Addr |
// RSP ----> +-----------------+ | Caller_RBP |
// (low address) +-----------------+ <----- RSP, RBP
//
cur = m.setupRBPRSP(cur)
goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur)
// Save the callee saved registers.
cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
if needModuleContextPtr {
moduleCtrPtr := rbxVReg // Module context is always the second argument.
mem := m.newAmodeImmReg(
wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(),
execCtrPtr)
store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8)
cur = linkInstr(cur, store)
}
// Now let's advance the RSP to the stack slot for the arguments.
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | =======> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | Return Addr | | Return Addr |
// | Caller_RBP | | Caller_RBP |
// RBP,RSP --> +-----------------+ +-----------------+ <----- RBP
// (low address) | arg[N]/ret[M] |
// | .......... |
// | arg[1]/ret[1] |
// | arg[0]/ret[0] |
// +-----------------+ <----- RSP
// (low address)
//
// where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
// therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
// the arguments/return values to/from Go function.
cur = m.addRSP(-int32(goSliceSizeAligned), cur)
// Next, we need to store all the arguments to the stack in the typical Wasm stack style.
var offsetInGoSlice int32
for i := range abi.Args[argBegin:] {
arg := &abi.Args[argBegin+i]
var v regalloc.VReg
if arg.Kind == backend.ABIArgKindReg {
v = arg.Reg
} else {
// We have saved callee saved registers, so we can use them.
if arg.Type.IsInt() {
v = r15VReg
} else {
v = xmm15VReg
}
mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
load := m.allocateInstr()
switch arg.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, mem, v)
case ssa.TypeI64:
load.asMov64MR(mem, v)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
}
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
switch arg.Type {
case ssa.TypeI32:
store.asMovRM(v, mem, 4)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeI64:
store.asMovRM(v, mem, 8)
offsetInGoSlice += 8
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, v, mem)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
offsetInGoSlice += 8
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
offsetInGoSlice += 16
default:
panic("BUG")
}
cur = linkInstr(cur, store)
}
// Finally we push the size of the slice to the stack so the stack looks like:
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | Return Addr |
// | Caller_RBP |
// +-----------------+ <----- RBP
// | arg[N]/ret[M] |
// | .......... |
// | arg[1]/ret[1] |
// | arg[0]/ret[0] |
// | slice size |
// +-----------------+ <----- RSP
// (low address)
//
// push $sliceSize
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned))))
// Load the exitCode to the register.
exitCodeReg := r12VReg // Callee saved which is already saved.
cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false))
saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
cur = linkInstr(cur, setExitCode)
cur = linkInstr(cur, saveRsp)
cur = linkInstr(cur, saveRbp)
// Ready to exit the execution.
cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
// We don't need the slice size anymore, so pop it.
cur = m.addRSP(8, cur)
// Ready to set up the results.
offsetInGoSlice = 0
// To avoid the result overwriting the execution context pointer, we track the offset here,
// and defer the restoration of that result to the end of this function.
var argOverlapWithExecCtxOffset int32 = -1
for i := range abi.Rets {
r := &abi.Rets[i]
var v regalloc.VReg
isRegResult := r.Kind == backend.ABIArgKindReg
if isRegResult {
v = r.Reg
if v.RealReg() == execCtrPtr.RealReg() {
argOverlapWithExecCtxOffset = offsetInGoSlice
offsetInGoSlice += 8 // always uint64 rep.
continue
}
} else {
if r.Type.IsInt() {
v = r15VReg
} else {
v = xmm15VReg
}
}
load := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
switch r.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, mem, v)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeI64:
load.asMov64MR(mem, v)
offsetInGoSlice += 8
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
offsetInGoSlice += 8
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
offsetInGoSlice += 16
default:
panic("BUG")
}
cur = linkInstr(cur, load)
if !isRegResult {
// We need to store it back to the result slot above rbp.
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
switch r.Type {
case ssa.TypeI32:
store.asMovRM(v, mem, 4)
case ssa.TypeI64:
store.asMovRM(v, mem, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, v, mem)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
default:
panic("BUG")
}
cur = linkInstr(cur, store)
}
}
// Before return, we need to restore the callee saved registers.
cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
if argOverlapWithExecCtxOffset >= 0 {
// At this point execCtrPtr is not used anymore, so we can finally store the
// result to the register which overlaps with the execution context pointer.
mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg))
load := m.allocateInstr().asMov64MR(mem, execCtrPtr)
cur = linkInstr(cur, load)
}
// Finally ready to return.
cur = m.revertRBPRSP(cur)
linkInstr(cur, m.allocateInstr().asRet())
m.encodeWithoutSSA(ectx.RootInstr)
return m.c.Buf()
}
func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
switch v.RegType() {
case regalloc.RegTypeInt:
store.asMovRM(v, mem, 8)
case regalloc.RegTypeFloat:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
default:
panic("BUG")
}
cur = linkInstr(cur, store)
offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally.
}
return cur
}
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
load := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
switch v.RegType() {
case regalloc.RegTypeInt:
load.asMov64MR(mem, v)
case regalloc.RegTypeFloat:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally.
}
return cur
}
func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction {
readRip := m.allocateInstr()
cur = linkInstr(cur, readRip)
ripReg := r12VReg // Callee saved which is already saved.
saveRip := m.allocateInstr().asMovRM(
ripReg,
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)),
8,
)
cur = linkInstr(cur, saveRip)
exit := m.allocateExitSeq(execCtx)
cur = linkInstr(cur, exit)
nop, l := m.allocateBrTarget()
cur = linkInstr(cur, nop)
readRip.asLEA(newOperandLabel(l), ripReg)
return cur
}
// stackGrowSaveVRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient
// stack space left. Basically this is all the allocatable registers except for RSP, RBP, and RAX, which contains the
// execution context pointer. The ExecCtx pointer is always the first argument so we don't need to save it.
var stackGrowSaveVRegs = []regalloc.VReg{
rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg,
xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg,
}
// CompileStackGrowCallSequence implements backend.Machine.
func (m *machine) CompileStackGrowCallSequence() []byte {
ectx := m.ectx
cur := m.allocateNop()
ectx.RootInstr = cur
cur = m.setupRBPRSP(cur)
// Execution context is always the first argument.
execCtrPtr := raxVReg
// Save the callee saved and argument registers.
cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
// Load the exitCode to the register.
exitCodeReg := r12VReg // Already saved.
cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false))
saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
cur = linkInstr(cur, setExitCode)
cur = linkInstr(cur, saveRsp)
cur = linkInstr(cur, saveRbp)
// Ready to exit the execution.
cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
// After the exit, restore the saved registers.
cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
// Finally ready to return.
cur = m.revertRBPRSP(cur)
linkInstr(cur, m.allocateInstr().asRet())
m.encodeWithoutSSA(ectx.RootInstr)
return m.c.Buf()
}
// insertStackBoundsCheck will insert the instructions after `cur` to check the
// stack bounds, and if there's no sufficient spaces required for the function,
// exit the execution and try growing it in Go world.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
// add $requiredStackSize, %rsp ;; Temporarily update the sp.
// cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp.
// ja .ok
// sub $requiredStackSize, %rsp ;; Reverse the temporary update.
// pushq r15 ;; save the temporary.
// mov $requiredStackSize, %r15
// mov %r15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context.
// popq r15 ;; restore the temporary.
// callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack.
// jmp .cont
// .ok:
// sub $requiredStackSize, %rsp ;; Reverse the temporary update.
// .cont:
cur = m.addRSP(-int32(requiredStackSize), cur)
cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true,
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)),
rspVReg, true))
ja := m.allocateInstr()
cur = linkInstr(cur, ja)
cur = m.addRSP(int32(requiredStackSize), cur)
// Save the temporary.
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg)))
// Load the required size to the temporary.
cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true))
// Set the required size in the execution context.
cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg,
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8))
// Restore the temporary.
cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg))
// Call the Go function to grow the stack.
cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg(
wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil))
// Jump to the continuation.
jmpToCont := m.allocateInstr()
cur = linkInstr(cur, jmpToCont)
// .ok:
okInstr, ok := m.allocateBrTarget()
cur = linkInstr(cur, okInstr)
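// condNBE corresponds to the ja .ok in the pseudo code above: the branch is taken when RSP is
// (unsigned) above the stack bottom pointer, i.e. there is enough stack space and no growth is needed.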
ja.asJmpIf(condNBE, newOperandLabel(ok))
// On the ok path, we only need to reverse the temporary update.
cur = m.addRSP(int32(requiredStackSize), cur)
// .cont:
contInstr, cont := m.allocateBrTarget()
cur = linkInstr(cur, contInstr)
jmpToCont.asJmp(newOperandLabel(cont))
return cur
}

View File

@ -0,0 +1,168 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type cond byte
const (
// condO represents (overflow) condition.
condO cond = iota
// condNO represents (no overflow) condition.
condNO
// condB represents (< unsigned) condition.
condB
// condNB represents (>= unsigned) condition.
condNB
// condZ represents (zero) condition.
condZ
// condNZ represents (not-zero) condition.
condNZ
// condBE represents (<= unsigned) condition.
condBE
// condNBE represents (> unsigned) condition.
condNBE
// condS represents (negative) condition.
condS
// condNS represents (not-negative) condition.
condNS
// condP represents (parity) condition.
condP
// condNP represents (not parity) condition.
condNP
// condL represents (< signed) condition.
condL
// condNL represents (>= signed) condition.
condNL
// condLE represents (<= signed) condition.
condLE
// condNLE represents (> signed) condition.
condNLE
condInvalid
)
func (c cond) String() string {
switch c {
case condO:
return "o"
case condNO:
return "no"
case condB:
return "b"
case condNB:
return "nb"
case condZ:
return "z"
case condNZ:
return "nz"
case condBE:
return "be"
case condNBE:
return "nbe"
case condS:
return "s"
case condNS:
return "ns"
case condL:
return "l"
case condNL:
return "nl"
case condLE:
return "le"
case condNLE:
return "nle"
case condP:
return "p"
case condNP:
return "np"
default:
panic("unreachable")
}
}
func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond {
switch origin {
case ssa.IntegerCmpCondEqual:
return condZ
case ssa.IntegerCmpCondNotEqual:
return condNZ
case ssa.IntegerCmpCondSignedLessThan:
return condL
case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
return condNL
case ssa.IntegerCmpCondSignedGreaterThan:
return condNLE
case ssa.IntegerCmpCondSignedLessThanOrEqual:
return condLE
case ssa.IntegerCmpCondUnsignedLessThan:
return condB
case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
return condNB
case ssa.IntegerCmpCondUnsignedGreaterThan:
return condNBE
case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
return condBE
default:
panic("unreachable")
}
}
func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond {
switch origin {
case ssa.FloatCmpCondGreaterThanOrEqual:
return condNB
case ssa.FloatCmpCondGreaterThan:
return condNBE
case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual:
panic(fmt.Sprintf("cond %s must be treated as a special case", origin))
default:
panic("unreachable")
}
}
func (c cond) encoding() byte {
return byte(c)
}
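// Note: the cond constants above follow the x86 condition-code encoding, in which a condition and
// its negation differ only in the lowest bit, so invert below is equivalent to returning c ^ 1.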
func (c cond) invert() cond {
switch c {
case condO:
return condNO
case condNO:
return condO
case condB:
return condNB
case condNB:
return condB
case condZ:
return condNZ
case condNZ:
return condZ
case condBE:
return condNBE
case condNBE:
return condBE
case condS:
return condNS
case condNS:
return condS
case condP:
return condNP
case condNP:
return condP
case condL:
return condNL
case condNL:
return condL
case condLE:
return condNLE
case condNLE:
return condLE
default:
panic("unreachable")
}
}

View File

@ -0,0 +1,35 @@
package amd64
// extMode represents the mode of extension in movzx/movsx.
type extMode byte
const (
// extModeBL represents Byte -> Longword.
extModeBL extMode = iota
// extModeBQ represents Byte -> Quadword.
extModeBQ
// extModeWL represents Word -> Longword.
extModeWL
// extModeWQ represents Word -> Quadword.
extModeWQ
// extModeLQ represents Longword -> Quadword.
extModeLQ
)
// String implements fmt.Stringer.
func (e extMode) String() string {
switch e {
case extModeBL:
return "bl"
case extModeBQ:
return "bq"
case extModeWL:
return "wl"
case extModeWQ:
return "wq"
case extModeLQ:
return "lq"
default:
panic("BUG: invalid ext mode")
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,71 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
vr = m.c.AllocateVReg(valType)
m.insertLoadConstant(instr, vr)
return
}
// InsertLoadConstantBlockArg implements backend.Machine.
func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
m.insertLoadConstant(instr, vr)
}
func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
v := instr.ConstantVal()
bits := valType.Bits()
if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
v = v & ((1 << valType.Bits()) - 1)
}
switch valType {
case ssa.TypeF32, ssa.TypeF64:
m.lowerFconst(vr, v, bits == 64)
case ssa.TypeI32, ssa.TypeI64:
m.lowerIconst(vr, v, bits == 64)
default:
panic("BUG")
}
}
func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) {
if c == 0 {
xor := m.allocateInstr().asZeros(dst)
m.insert(xor)
} else {
var tmpType ssa.Type
if _64 {
tmpType = ssa.TypeI64
} else {
tmpType = ssa.TypeI32
}
tmpInt := m.c.AllocateVReg(tmpType)
loadToGP := m.allocateInstr().asImm(tmpInt, c, _64)
m.insert(loadToGP)
movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64)
m.insert(movToXmm)
}
}
func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) {
i := m.allocateInstr()
if c == 0 {
i.asZeros(dst)
} else {
i.asImm(dst, c, _64)
}
m.insert(i)
}

View File

@ -0,0 +1,187 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl}
type addend struct {
r regalloc.VReg
off int64
shift byte
}
func (a addend) String() string {
return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift)
}
// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) {
def := m.c.ValueDefinition(ptr)
if offsetBase&0x80000000 != 0 {
// Special-case a huge base offset whose MSB is set. In x64, immediates are always
// sign-extended, but our IR semantics require the offset base to be unsigned.
// Note that this should be extremely rare, if it ever happens in a real application,
// so there is no need to optimize this case.
a := m.lowerAddend(def)
off64 := a.off + int64(offsetBase)
offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(offsetBaseReg, uint64(off64), true)
if a.r != regalloc.VRegInvalid {
return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift)
} else {
return m.newAmodeImmReg(0, offsetBaseReg)
}
}
if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd {
add := def.Instr
x, y := add.Arg2()
xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
ax := m.lowerAddend(xDef)
ay := m.lowerAddend(yDef)
add.MarkLowered()
return m.lowerAddendsToAmode(ax, ay, offsetBase)
} else {
// If it is not an Iadd, then we lower the one addend.
a := m.lowerAddend(def)
// off is always 0 if r is valid.
if a.r != regalloc.VRegInvalid {
if a.shift != 0 {
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, 0, true)
return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift)
}
return m.newAmodeImmReg(offsetBase, a.r)
} else {
off64 := a.off + int64(offsetBase)
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, uint64(off64), true)
return m.newAmodeImmReg(0, tmpReg)
}
}
}
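// The following is an illustrative sketch (not part of the wazero sources): the effective address
// that the resulting amode ultimately encodes is the usual x86 base + (index << shift) + disp32 form.
// For example, a pointer defined as Iadd(x, Ishl(y, 3)) with offsetBase 8 would lower to the
// equivalent of 8(x, y, 8) in AT&T syntax.
func effectiveAddressSketch(base, index uint64, shift byte, disp32 uint32) uint64 { // hypothetical helper, for illustration only
return base + (index << shift) + uint64(disp32)
}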
func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode {
if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 {
panic("invalid input")
}
u64 := uint64(x.off+y.off) + uint64(offBase)
if u64 != 0 {
if _, ok := asImm32(u64, false); !ok {
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, u64, true)
// Blank u64 as it has already been lowered.
u64 = 0
if x.r == regalloc.VRegInvalid {
x.r = tmpReg
} else if y.r == regalloc.VRegInvalid {
y.r = tmpReg
} else {
// Both x.r and y.r are already valid, so there is nowhere to put the temporary register.
// This should not happen given the invariant checked at the top of this function.
panic("BUG")
}
}
}
u32 := uint32(u64)
switch {
// We assume x.r (resp. y.r) is valid only if x.off (resp. y.off) is 0.
case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
switch {
case x.shift != 0 && y.shift != 0:
// Cannot absorb two shifted registers, must lower one to a shift instruction.
shifted := m.allocateInstr()
shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true)
m.insert(shifted)
return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
case x.shift != 0 && y.shift == 0:
// Swap base and index.
x, y = y, x
fallthrough
default:
return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
}
case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
x, y = y, x
fallthrough
case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid:
if x.shift != 0 {
zero := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(zero, 0, true)
return m.newAmodeRegRegShift(u32, zero, x.r, x.shift)
}
return m.newAmodeImmReg(u32, x.r)
default: // Both are invalid: use the offset.
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, u64, true)
return m.newAmodeImmReg(0, tmpReg)
}
}
func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend {
if x.IsFromBlockParam() {
return addend{x.BlkParamVReg, 0, 0}
}
// Ensure the addend is not referenced in multiple places; we will discard nested Iadds.
op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:])
if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd {
return m.lowerAddendFromInstr(x.Instr)
}
p := m.getOperand_Reg(x)
return addend{p.reg(), 0, 0}
}
// lowerAddendFromInstr takes an instruction and returns a VReg and an offset that can be used in an address mode.
// The Vreg is regalloc.VRegInvalid if the addend cannot be lowered to a register.
// The offset is 0 if the addend can be lowered to a register.
func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend {
instr.MarkLowered()
switch op := instr.Opcode(); op {
case ssa.OpcodeIconst:
u64 := instr.ConstantVal()
if instr.Return().Type().Bits() == 32 {
return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend.
} else {
return addend{regalloc.VRegInvalid, int64(u64), 0}
}
case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
input := instr.Arg()
inputDef := m.c.ValueDefinition(input)
if input.Type().Bits() != 32 {
panic("BUG: invalid input type " + input.Type().String())
}
constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
switch {
case constInst && op == ssa.OpcodeSExtend:
return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0}
case constInst && op == ssa.OpcodeUExtend:
return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend!
default:
r := m.getOperand_Reg(inputDef)
return addend{r.reg(), 0, 0}
}
case ssa.OpcodeIshl:
// If the addend is a shift, we can only handle it if the shift amount is a constant.
x, amount := instr.Arg2()
amountDef := m.c.ValueDefinition(amount)
if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 {
r := m.getOperand_Reg(m.c.ValueDefinition(x))
return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())}
}
r := m.getOperand_Reg(m.c.ValueDefinition(x))
return addend{r.reg(), 0, 0}
}
panic("BUG: invalid opcode")
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,304 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
)
// PostRegAlloc implements backend.Machine.
func (m *machine) PostRegAlloc() {
m.setupPrologue()
m.postRegAlloc()
}
func (m *machine) setupPrologue() {
cur := m.ectx.RootInstr
prevInitInst := cur.next
// At this point, we have the stack layout as follows:
//
// (high address)
// +-----------------+ <----- RBP (somewhere in the middle of the stack)
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | Return Addr |
// RSP ----> +-----------------+
// (low address)
// First, we push the RBP, and update the RBP to the current RSP.
//
// (high address) (high address)
// RBP ----> +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | ====> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | Return Addr | | Return Addr |
// RSP ----> +-----------------+ | Caller_RBP |
// (low address) +-----------------+ <----- RSP, RBP
//
cur = m.setupRBPRSP(cur)
if !m.stackBoundsCheckDisabled {
cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
}
//
// (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | | xxxxx |
// | Return Addr | | Return Addr |
// | Caller_RBP | ====> | Caller_RBP |
// RBP,RSP->+-----------------+ +-----------------+ <----- RBP
// (low address) | clobbered M |
// | clobbered 1 |
// | ........... |
// | clobbered 0 |
// +-----------------+ <----- RSP
//
if regs := m.clobberedRegs; len(regs) > 0 {
for i := range regs {
r := regs[len(regs)-1-i] // Reverse order.
if r.RegType() == regalloc.RegTypeInt {
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r)))
} else {
// Pushing an XMM register is not supported by the PUSH instruction.
cur = m.addRSP(-16, cur)
push := m.allocateInstr().asXmmMovRM(
sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)),
)
cur = linkInstr(cur, push)
}
}
}
if size := m.spillSlotSize; size > 0 {
// Simply decrease the RSP to allocate the spill slots.
// sub $size, %rsp
cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true))
// At this point, we have the stack layout as follows:
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <--- RBP
// | clobbered M |
// | ............ |
// | clobbered 1 |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// +-----------------+ <--- RSP
// (low address)
}
linkInstr(cur, prevInitInst)
}
// postRegAlloc does multiple things while walking through the instructions:
// 1. Inserts the epilogue code.
// 2. Removes the redundant copy instruction.
// 3. Inserts the dec/inc RSP instruction right before/after the call instruction.
// 4. Lowering that is supposed to be done after regalloc.
func (m *machine) postRegAlloc() {
ectx := m.ectx
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
switch k := cur.kind; k {
case ret:
m.setupEpilogueAfter(cur.prev)
continue
case fcvtToSintSequence, fcvtToUintSequence:
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
if k == fcvtToSintSequence {
m.lowerFcvtToSintSequenceAfterRegalloc(cur)
} else {
m.lowerFcvtToUintSequenceAfterRegalloc(cur)
}
prev := cur.prev
next := cur.next
cur := prev
for _, instr := range m.ectx.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
continue
case xmmCMov:
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
m.lowerXmmCmovAfterRegAlloc(cur)
prev := cur.prev
next := cur.next
cur := prev
for _, instr := range m.ectx.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
continue
case idivRemSequence:
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
m.lowerIDivRemSequenceAfterRegAlloc(cur)
prev := cur.prev
next := cur.next
cur := prev
for _, instr := range m.ectx.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
continue
case call, callIndirect:
// At this point, reg alloc is done, so we can safely insert the RSP dec/inc instructions
// right before/after the call instruction. If this were done before reg alloc, a stack slot
// could point to the wrong location and therefore result in a wrong value.
call := cur
next := call.next
_, _, _, _, size := backend.ABIInfoFromUint64(call.u2)
if size > 0 {
dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true)
linkInstr(call.prev, dec)
linkInstr(dec, call)
inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true)
linkInstr(call, inc)
linkInstr(inc, next)
}
continue
}
// Removes the redundant copy instruction.
if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() {
prev, next := cur.prev, cur.next
// Remove the copy instruction.
prev.next = next
if next != nil {
next.prev = prev
}
}
}
}
func (m *machine) setupEpilogueAfter(cur *instruction) {
prevNext := cur.next
// At this point, we have the stack layout as follows:
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <--- RBP
// | clobbered M |
// | ............ |
// | clobbered 1 |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// +-----------------+ <--- RSP
// (low address)
if size := m.spillSlotSize; size > 0 {
// Simply increase the RSP to free the spill slots.
// add $size, %rsp
cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true))
}
//
// (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | ReturnAddress | | ReturnAddress |
// | Caller_RBP | | Caller_RBP |
// RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP
// | clobbered M |
// | ............ |
// | clobbered 1 |
// | clobbered 0 |
// RSP ---> +-----------------+
// (low address)
//
if regs := m.clobberedRegs; len(regs) > 0 {
for _, r := range regs {
if r.RegType() == regalloc.RegTypeInt {
cur = linkInstr(cur, m.allocateInstr().asPop64(r))
} else {
// Popping an XMM register is not supported by the POP instruction.
pop := m.allocateInstr().asXmmUnaryRmR(
sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r,
)
cur = linkInstr(cur, pop)
cur = m.addRSP(16, cur)
}
}
}
// Now roll back the RSP to RBP, and pop the caller's RBP.
cur = m.revertRBPRSP(cur)
linkInstr(cur, prevNext)
}
func (m *machine) addRSP(offset int32, cur *instruction) *instruction {
if offset == 0 {
return cur
}
opcode := aluRmiROpcodeAdd
if offset < 0 {
opcode = aluRmiROpcodeSub
offset = -offset
}
return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true))
}
func (m *machine) setupRBPRSP(cur *instruction) *instruction {
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg)))
cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true))
return cur
}
func (m *machine) revertRBPRSP(cur *instruction) *instruction {
cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true))
cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg))
return cur
}

View File

@ -0,0 +1,153 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// InsertMoveBefore implements backend.RegAllocFunctionMachine.
func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
typ := src.RegType()
if typ != dst.RegType() {
panic("BUG: src and dst must have the same type")
}
mov := m.allocateInstr()
if typ == regalloc.RegTypeInt {
mov.asMovRR(src, dst, true)
} else {
mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst)
}
cur := instr.prev
prevNext := cur.next
cur = linkInstr(cur, mov)
linkInstr(cur, prevNext)
}
// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.c.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
switch typ {
case ssa.TypeI32:
store.asMovRM(v, mem, 4)
case ssa.TypeI64:
store.asMovRM(v, mem, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, v, mem)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
}
cur = linkInstr(cur, store)
return linkInstr(cur, prevNext)
}
// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.c.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
// Load the value to the temporary.
load := m.allocateInstr()
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
switch typ {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, a, v)
case ssa.TypeI64:
load.asMov64MR(a, v)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, a, v)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, a, v)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
return linkInstr(cur, prevNext)
}
// ClobberedRegisters implements backend.RegAllocFunctionMachine.
func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
}
// Swap implements backend.RegAllocFunctionMachine.
func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
if x1.RegType() == regalloc.RegTypeInt {
prevNext := cur.next
xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8)
cur = linkInstr(cur, xc)
linkInstr(cur, prevNext)
} else {
if tmp.Valid() {
prevNext := cur.next
m.InsertMoveBefore(tmp, x1, prevNext)
m.InsertMoveBefore(x1, x2, prevNext)
m.InsertMoveBefore(x2, tmp, prevNext)
} else {
prevNext := cur.next
r2 := x2.RealReg()
// Temporarily spill x1 to stack.
cur = m.InsertStoreRegisterAt(x1, cur, true).prev
// Then move x2 to x1.
cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1))
linkInstr(cur, prevNext)
// Then reload the original value on x1 from stack to r2.
m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
}
}
}
// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
cur := end
for cur.kind == nop0 {
cur = cur.prev
if cur == begin {
return end
}
}
switch cur.kind {
case jmp:
return cur
default:
return end
}
}
// SSABlockLabel implements backend.RegAllocFunctionMachine.
func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
return m.ectx.SsaBlockIDToLabels[id]
}

View File

@ -0,0 +1,992 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
var swizzleMask = [16]byte{
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
}
func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) {
masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:])
// Load mask to maskReg.
maskReg := m.c.AllocateVReg(ssa.TypeV128)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg)
m.insert(loadMask)
// Copy x and y to tmp registers.
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
tmpDst := m.copyToTmp(xx.reg())
yy := m.getOperand_Reg(m.c.ValueDefinition(y))
tmpX := m.copyToTmp(yy.reg())
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst))
// Copy the result to the destination register.
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) {
// Copy x to tmp.
tmpDst := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst))
yy := m.getOperand_Reg(m.c.ValueDefinition(y))
switch lane {
case ssa.VecLaneI8x16:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst))
case ssa.VecLaneI16x8:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst))
case ssa.VecLaneI32x4:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst))
case ssa.VecLaneI64x2:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst))
case ssa.VecLaneF32x4:
// In the INSERTPS instruction, the destination index is encoded in bits 4 and 5 of the immediate.
// See https://www.felixcloutier.com/x86/insertps
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst))
case ssa.VecLaneF64x2:
if index == 0 {
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst))
} else {
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) {
// Pextr variants are used to extract a lane from a vector register.
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
tmpDst := m.c.AllocateVReg(ret.Type())
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
switch lane {
case ssa.VecLaneI8x16:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst))
if signed {
m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
} else {
m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
}
case ssa.VecLaneI16x8:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst))
if signed {
m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
} else {
m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
}
case ssa.VecLaneI32x4:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst))
case ssa.VecLaneI64x2:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst))
case ssa.VecLaneF32x4:
if index == 0 {
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst))
} else {
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst))
}
case ssa.VecLaneF64x2:
if index == 0 {
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
} else {
m.copyTo(xx.reg(), tmpDst)
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
var sqmulRoundSat = [16]byte{
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
}
func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) {
// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:])
tmp := m.c.AllocateVReg(ssa.TypeV128)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp)
m.insert(loadMask)
xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
tmpX := m.copyToTmp(xx.reg())
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX))
m.copyTo(tmpX, m.c.VRegOf(ret))
}
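// Illustrative sketch (not part of the wazero sources): the scalar semantics of i16x8.q15mulr_sat_s
// that the sequence above implements, i.e. a rounding Q15 multiply with signed saturation. PMULHRSW
// alone yields -32768 for the single overflowing input -32768 * -32768, and the mask compare/xor
// above flips that lane to the saturated 32767 (see the linked SIMD proposal discussion).
func q15MulRSatSketch(x, y int16) int16 { // hypothetical helper, for illustration only
v := (int32(x)*int32(y) + (1 << 14)) >> 15 // Q15 multiply with rounding.
if v > 32767 { // only -32768 * -32768 exceeds the int16 range.
v = 32767
}
return int16(v)
}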
func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) {
switch lane {
case ssa.VecLaneI8x16:
m.lowerVUshri8x16(x, y, ret)
case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2:
m.lowerShr(x, y, ret, lane, false)
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
}
// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
}
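// Illustrative sketch (not part of the wazero sources): each 16-byte row of the table above is just
// 0xff logically shifted right by the row's shift amount, broadcast to all 16 bytes. PSRLW shifts
// 16-bit words, so bits of the neighboring byte leak into each byte's high bits, and AND-ing with
// the selected row clears exactly those leaked bits.
func i8x16LogicalSHRMaskRowSketch(shift uint8) (row [16]byte) { // hypothetical helper, for illustration only
b := byte(0xff) >> shift // e.g. shift=3 -> 0x1f, matching the "for 3 shift" row above.
for i := range row {
row[i] = b
}
return
}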
func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) {
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the modulo 8 mask to tmpReg.
m.lowerIconst(tmpGpReg, 0x7, false)
// Take the modulo 8 of the shift amount.
shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false))
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
vecTmp := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx))
maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:])
base := m.c.AllocateVReg(ssa.TypeI64)
lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
m.insert(lea)
// Shift tmpGpReg by 4 to multiply the shift amount by 16.
m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp)
m.insert(loadMask)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) {
switch lane {
case ssa.VecLaneI8x16:
m.lowerVSshri8x16(x, y, ret)
case ssa.VecLaneI16x8, ssa.VecLaneI32x4:
m.lowerShr(x, y, ret, lane, true)
case ssa.VecLaneI64x2:
m.lowerVSshri64x2(x, y, ret)
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
}
func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) {
shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the modulo 8 mask to tmpReg.
m.lowerIconst(shiftAmtReg, 0x7, false)
// Take the modulo 8 of the shift amount.
shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false))
// Copy the x value to two temporary registers.
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
vecTmp := m.c.AllocateVReg(ssa.TypeV128)
m.copyTo(xx, vecTmp)
// Assuming that we have
// xx = [b1, ..., b16]
// vecTmp = [b1, ..., b16]
// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
// xx = [b1, b1, b2, b2, ..., b8, b8]
// vecTmp = [b9, b9, b10, b10, ..., b16, b16]
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp))
// Add 8 to the shift amount, and then move it to vecTmp2.
vecTmp2 := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false))
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false))
// Perform the word packed arithmetic right shifts on vreg and vecTmp.
// This changes these two registers as:
// xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
// vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
// where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp))
// Finally, we can get the result by packing these two word vectors.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) {
// Load the shift amount to RCX.
shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg))
tmpGp := m.c.AllocateVReg(ssa.TypeI64)
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xxReg := m.copyToTmp(_xx.reg())
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp))
m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp))
m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg))
m.copyTo(xxReg, m.c.VRegOf(ret))
}
func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
var modulo uint64
var shiftOp sseOpcode
switch lane {
case ssa.VecLaneI16x8:
modulo = 0xf
if signed {
shiftOp = sseOpcodePsraw
} else {
shiftOp = sseOpcodePsrlw
}
case ssa.VecLaneI32x4:
modulo = 0x1f
if signed {
shiftOp = sseOpcodePsrad
} else {
shiftOp = sseOpcodePsrld
}
case ssa.VecLaneI64x2:
modulo = 0x3f
if signed {
panic("BUG")
}
shiftOp = sseOpcodePsrlq
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the lane-size modulo mask into tmpGpReg.
m.lowerIconst(tmpGpReg, modulo, false)
// Mask the shift amount down to the lane bit width.
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
// And move it to a xmm register.
tmpVec := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
// Then do the actual shift.
m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) {
var modulo uint64
var shiftOp sseOpcode
var isI8x16 bool
switch lane {
case ssa.VecLaneI8x16:
isI8x16 = true
modulo = 0x7
shiftOp = sseOpcodePsllw
case ssa.VecLaneI16x8:
modulo = 0xf
shiftOp = sseOpcodePsllw
case ssa.VecLaneI32x4:
modulo = 0x1f
shiftOp = sseOpcodePslld
case ssa.VecLaneI64x2:
modulo = 0x3f
shiftOp = sseOpcodePsllq
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the lane-size modulo mask into tmpGpReg.
m.lowerIconst(tmpGpReg, modulo, false)
// Mask the shift amount down to the lane bit width.
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
// And move it to a xmm register.
tmpVec := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
// Then do the actual shift.
m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
if isI8x16 {
maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:])
base := m.c.AllocateVReg(ssa.TypeI64)
lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
m.insert(lea)
// Shift tmpGpReg by 4 to multiply the shift amount by 16.
m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec)
m.insert(loadMask)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx))
}
m.copyTo(xx, m.c.VRegOf(ret))
}
// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
}
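// Illustrative sketch (not part of the wazero sources), mirroring the logical-shift-right table above:
// each row here is 0xff shifted left by the row's shift amount, broadcast to 16 bytes, clearing the
// bits that PSLLW lets spill in from the neighboring lower byte.
func i8x16SHLMaskRowSketch(shift uint8) (row [16]byte) { // hypothetical helper, for illustration only
b := byte(0xff) << shift // e.g. shift=5 -> 0xe0, matching the "for 5 shift" row above.
for i := range row {
row[i] = b
}
return
}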
func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) {
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
var round sseOpcode
if _64 {
round = sseOpcodeRoundpd
} else {
round = sseOpcodeRoundps
}
m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret)))
}
var (
allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80}
extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00}
)
func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
switch srcLane {
case ssa.VecLaneI8x16:
allOneReg := m.c.AllocateVReg(ssa.TypeV128)
mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg))
var resultReg regalloc.VReg
if signed {
resultReg = allOneReg
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg))
} else {
// Interpret tmp (all ones) as the signed-byte operand so that the multiply-add effectively treats xx as unsigned.
resultReg = xx
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg))
}
m.copyTo(resultReg, m.c.VRegOf(ret))
case ssa.VecLaneI16x8:
if signed {
allOnesReg := m.c.AllocateVReg(ssa.TypeV128)
mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx))
m.copyTo(xx, m.c.VRegOf(ret))
} else {
maskReg := m.c.AllocateVReg(ssa.TypeV128)
mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
// Flip the sign bits on xx.
//
// Assuming that xx = [w1, ..., w8], now we have,
// xx[i] = int8(-w1) for i = 0...8
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx))
mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
// For i = 0,..4 (as this results in i32x4 lanes), now we have
// xx[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1)))
// c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx))
mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
// vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)).
// c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", srcLane))
}
}
func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
var sseOp sseOpcode
switch lane {
case ssa.VecLaneI8x16:
if signed {
sseOp = sseOpcodePmovsxbw
} else {
sseOp = sseOpcodePmovzxbw
}
case ssa.VecLaneI16x8:
if signed {
sseOp = sseOpcodePmovsxwd
} else {
sseOp = sseOpcodePmovzxwd
}
case ssa.VecLaneI32x4:
if signed {
sseOp = sseOpcodePmovsxdq
} else {
sseOp = sseOpcodePmovzxdq
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret)))
}
func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
tmp := m.c.AllocateVReg(ssa.TypeV128)
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
m.copyTo(xx.reg(), tmp)
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp))
var sseOp sseOpcode
switch lane {
case ssa.VecLaneI8x16:
if signed {
sseOp = sseOpcodePmovsxbw
} else {
sseOp = sseOpcodePmovzxbw
}
case ssa.VecLaneI16x8:
if signed {
sseOp = sseOpcodePmovsxwd
} else {
sseOp = sseOpcodePmovzxwd
}
case ssa.VecLaneI32x4:
if signed {
sseOp = sseOpcodePmovsxdq
} else {
sseOp = sseOpcodePmovzxdq
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret)))
}
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) {
tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64)
am := newOperandMem(m.lowerToAddressMode(ptr, offset))
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
switch lane {
case ssa.VecLaneI8x16:
m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst))
tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asZeros(tmpZeroVec))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst))
case ssa.VecLaneI16x8:
m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
case ssa.VecLaneI32x4:
m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
case ssa.VecLaneI64x2:
m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
var f64x2CvtFromIMask = [16]byte{
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}
func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
switch lane {
case ssa.VecLaneF32x4:
if signed {
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
} else {
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
// Copy the value to two temporary registers.
tmp := m.copyToTmp(xx.reg())
tmp2 := m.copyToTmp(xx.reg())
// Clear the higher 16 bits of each 32-bit element.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))
// Subtract the higher 16-bits from tmp2: clear the lower 16-bits of tmp2.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))
// Convert the lower 16-bits in tmp.
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
// Halve tmp2 by shifting it right by one, then convert it, so tmp2 holds half of the conversion of the higher 16 bits.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))
// Double the converted, halved higher 16 bits.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))
// Get the conversion result by adding tmp (holding the lower 16-bit conversion) into tmp2.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))
m.copyTo(tmp2, m.c.VRegOf(ret))
}
case ssa.VecLaneF64x2:
if signed {
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
} else {
maskReg := m.c.AllocateVReg(ssa.TypeV128)
maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
// maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
// Given that we have xx = [d1, d2, d3, d4], this results in
// xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
// = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
// ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))
// maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
// Now, we get the result as
// xx = [float64(uint32(d1)), float64(uint32(d2))]
// because the following equality always satisfies:
// float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
}
var (
// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
i32sMaxOnF64x2 = [16]byte{
0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
}
// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
i32uMaxOnF64x2 = [16]byte{
0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
}
// twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that,
// with this exponent, the low bits of the mantissa represent a corresponding uint32 number, and after
// arithmetic like addition or subtraction, the resulting floating point still holds exactly that
// 32-bit integer bit pattern in its mantissa.
//
// Note: the name twop52 is common across various compiler ecosystems.
// E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
// E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
twop52 = [16]byte{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
}
)
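// Illustrative sketch (not part of the wazero sources): a quick check of the 0x1.0p52 trick described
// above. Adding any uint32 value to 0x1.0p52 is exact (the float64 spacing around 2^52 is 1.0), so the
// low 32 bits of the sum's mantissa are exactly that uint32, which is what the SUBPD sequence above and
// the ADDPD/SHUFPS sequence below rely on.
func twop52RoundTripsSketch(u uint32) bool { // hypothetical helper, for illustration only
f := 0x1.0p52 + float64(u) // exact for any uint32 value.
return uint32(f-0x1.0p52) == u
}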
func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
switch lane {
case ssa.VecLaneF32x4:
if signed {
tmp := m.copyToTmp(xx)
// Assuming we have xx = [v1, v2, v3, v4].
//
// Set all bits if lane is not NaN on tmp.
// tmp[i] = 0xffffffff if vi != NaN
// = 0 if vi == NaN
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
// Clear NaN lanes on xx, meaning that
// xx[i] = vi if vi != NaN
// 0 if vi == NaN
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx))
// tmp[i] = ^vi if vi != NaN
// = 0xffffffff if vi == NaN
// which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp))
// xx[i] = int32(vi) if vi != NaN and xx is not overflowing.
// = 0x80000000 if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
// = 0 if vi == NaN
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane.
//
// tmp[i] = 0x80000000 if vi is positive
// = any satisfying any&0x80000000 = 0 if vi is negative or zero.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp))
// Arithmetic right shifting tmp by 31, meaning that we have
// tmp[i] = 0xffffffff if vi is positive, 0 otherwise.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp))
// Flipping 0x80000000 if vi is positive, otherwise keep intact.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx))
} else {
tmp := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asZeros(tmp))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
tmp2 := m.copyToTmp(xx)
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx))
}
case ssa.VecLaneF64x2:
tmp2 := m.c.AllocateVReg(ssa.TypeV128)
if signed {
tmp := m.copyToTmp(xx)
// Set all bits for non-NaN lanes, zeros otherwise.
// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:])
// Load float64(2147483647) into each lane of tmp2.
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))
// tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp))
// MINPD returns the second (source) operand when either input is NaN, so we have
// xx[i] = min(vi, 2147483647.0) if vi != NaN
//       = 0                     if vi == NaN
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx))
} else {
tmp := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asZeros(tmp))
// xx[i] = vi if vi != NaN && vi > 0
// = 0 if vi == NaN || vi <= 0
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx))
// tmp2[i] = float64(math.MaxUint32), which is exactly representable in float64.
maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
// xx[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32
// = 0 otherwise
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx))
// Truncate the floating point values toward zero (ROUNDPD with rounding mode 0b11).
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx))
// tmp2[i] = float64(0x1.0p52)
maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
// xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32
// = 0 otherwise
//
// This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx))
// At this point, we have
// xx = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)]
// tmp = [0, 0, 0, 0]
// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
// xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0]
// meaning that for i = 0 and 1, we have
// xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32
// = 0 otherwise.
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(xx, m.c.VRegOf(ret))
}
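The unsigned F64x2 path above leans on the classic 2^52 trick: once a non-negative float64 smaller than 2^32 is truncated and added to 0x1.0p52, its integer value sits verbatim in the low 32 bits of the sum's bit pattern, which is what the final SHUFPS extracts. A standalone sketch of that identity in plain Go (illustrative only, not part of this file):

package main

import (
	"fmt"
	"math"
)

func main() {
	const twop52 = 0x1p52 // same constant as the twop52 table above.
	for _, v := range []float64{0, 1, 255.7, 4294967295} {
		t := math.Trunc(v)                    // ROUNDPD with 0b11 truncates toward zero.
		sum := t + twop52                     // ADDPD: the integer value lands in the mantissa.
		lo32 := uint32(math.Float64bits(sum)) // SHUFPS keeps only the low 32 bits of each lane.
		fmt.Println(uint32(t) == lo32)        // true for every v in [0, 2^32).
	}
}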
func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
var sseOp sseOpcode
switch lane {
case ssa.VecLaneI16x8:
if signed {
sseOp = sseOpcodePacksswb
} else {
sseOp = sseOpcodePackuswb
}
case ssa.VecLaneI32x4:
if signed {
sseOp = sseOpcodePackssdw
} else {
sseOp = sseOpcodePackusdw
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVIabs(instr *ssa.Instruction) {
x, lane := instr.ArgWithLane()
rd := m.c.VRegOf(instr.Return())
if lane == ssa.VecLaneI64x2 {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
blendReg := xmm0VReg
m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg))
tmp := m.copyToTmp(_xx.reg())
xx := m.copyToTmp(_xx.reg())
// Clear all bits on blendReg.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg))
// Subtract xx from blendReg, i.e. compute -xx.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg))
// Copy the negated value ^^ back into xx.
m.copyTo(blendReg, xx)
m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx))
m.copyTo(xx, rd)
} else {
var vecOp sseOpcode
switch lane {
case ssa.VecLaneI8x16:
vecOp = sseOpcodePabsb
case ssa.VecLaneI16x8:
vecOp = sseOpcodePabsw
case ssa.VecLaneI32x4:
vecOp = sseOpcodePabsd
}
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
i := m.allocateInstr()
i.asXmmUnaryRmR(vecOp, rn, rd)
m.insert(i)
}
}
func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) {
x := instr.Arg()
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
rd := m.c.VRegOf(instr.Return())
tmp1 := m.c.AllocateVReg(ssa.TypeV128)
m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f)
// Copy input into tmp2.
tmp2 := m.copyToTmp(rn.reg())
// Given that we have:
// rn = [b1, ..., b16] where bn = hn:ln and hn and ln are the higher and lower 4-bits of bn.
//
// Take PAND on tmp1 and tmp2, so that we mask out all the higher bits.
// tmp2 = [l1, ..., l16].
pand := m.allocateInstr()
pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2)
m.insert(pand)
// Do logical (packed word) right shift by 4 on another copy of rn (tmp3) and PAND against the mask (tmp1); meaning that we have
// tmp3 = [h1, ...., h16].
tmp3 := m.copyToTmp(rn.reg())
psrlw := m.allocateInstr()
psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3)
m.insert(psrlw)
pand2 := m.allocateInstr()
pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3)
m.insert(pand2)
// Read the popcntTable into tmp4, and we have
// tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
tmp4 := m.c.AllocateVReg(ssa.TypeV128)
m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01)
// Make a copy for later.
tmp5 := m.copyToTmp(tmp4)
// tmp4 = [popcnt(l1), ..., popcnt(l16)].
pshufb := m.allocateInstr()
pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4)
m.insert(pshufb)
pshufb2 := m.allocateInstr()
pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5)
m.insert(pshufb2)
// tmp4 + tmp5 is the result.
paddb := m.allocateInstr()
paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5)
m.insert(paddb)
m.copyTo(tmp5, rd)
}
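The PAND/PSRLW/PSHUFB/PADDB sequence above is a SIMD 4-bit table lookup. The same idea in scalar form, as an illustrative sketch that is not part of this file:

package main

import "fmt"

// popcntTable[i] is the number of set bits in the 4-bit value i; it is the same
// 16-byte table that lowerVIpopcnt loads into tmp4 via lowerVconst above.
var popcntTable = [16]byte{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}

func popcnt8(b byte) byte {
	lo := b & 0x0f                           // PAND with the 0x0f mask.
	hi := (b >> 4) & 0x0f                    // PSRLW by 4, then PAND.
	return popcntTable[lo] + popcntTable[hi] // two PSHUFB lookups, then PADDB.
}

func main() {
	fmt.Println(popcnt8(0b1011_0110)) // 5
}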
func (m *machine) lowerVImul(instr *ssa.Instruction) {
x, y, lane := instr.Arg2WithLane()
rd := m.c.VRegOf(instr.Return())
if lane == ssa.VecLaneI64x2 {
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
rm := m.getOperand_Reg(m.c.ValueDefinition(y))
// Assuming that we have
// rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_hi]
// rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_hi]
// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lanes.
// Copy rn into tmp1.
tmp1 := m.copyToTmp(rn.reg())
// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [q1_hi, 0, q2_hi, 0]
shift := m.allocateInstr()
shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1)
m.insert(shift)
// Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit.
mul := m.allocateInstr()
mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1)
m.insert(mul)
// Copy rm value into tmp2.
tmp2 := m.copyToTmp(rm.reg())
// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [p1_hi, 0, p2_hi, 0]
shift2 := m.allocateInstr()
shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2)
m.insert(shift2)
// Execute "pmuludq rm,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit.
mul2 := m.allocateInstr()
mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2)
m.insert(mul2)
// Add tmp1 and tmp2 and do the logical left shift by 32-bit,
// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32, (p2_lo*q2_hi+p2_hi*q2_lo)<<32]
add := m.allocateInstr()
add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1)
m.insert(add)
shift3 := m.allocateInstr()
shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1)
m.insert(shift3)
// Copy rm value into tmp3.
tmp3 := m.copyToTmp(rm.reg())
// "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
mul3 := m.allocateInstr()
mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3)
m.insert(mul3)
// Finally, we get the result by computing tmp1 + tmp3,
// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_hi+p2_hi*q2_lo)<<32+p2_lo*q2_lo]
add2 := m.allocateInstr()
add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1)
m.insert(add2)
m.copyTo(tmp1, rd)
} else {
var vecOp sseOpcode
switch lane {
case ssa.VecLaneI16x8:
vecOp = sseOpcodePmullw
case ssa.VecLaneI32x4:
vecOp = sseOpcodePmulld
default:
panic("unsupported: " + lane.String())
}
m.lowerVbBinOp(vecOp, x, y, instr.Return())
}
}
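The I64x2 path above is the usual 32-bit partial-product decomposition of a 64-bit multiply; the hi*hi term can be dropped because only the low 64 bits are kept. A scalar sketch of the same identity, purely illustrative and not part of this file:

package main

import "fmt"

// mul64 mirrors the PMULUDQ/PSRLQ/PADDQ/PSLLQ sequence above for a single lane.
func mul64(p, q uint64) uint64 {
	pLo, pHi := p&0xffffffff, p>>32
	qLo, qHi := q&0xffffffff, q>>32
	cross := (pLo*qHi + pHi*qLo) << 32 // the two cross products, summed and shifted.
	return cross + pLo*qLo             // plus the low product; pHi*qHi*2^64 wraps away.
}

func main() {
	p, q := uint64(0x1234567890abcdef), uint64(0xfedcba0987654321)
	fmt.Println(mul64(p, q) == p*q) // true
}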

View File

@ -0,0 +1,346 @@
package amd64
import (
"fmt"
"unsafe"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type operand struct {
kind operandKind
data uint64
}
type operandKind byte
const (
// operandKindReg is an operand which is an integer Register.
operandKindReg operandKind = iota + 1
// operandKindMem is a value in Memory.
// 32, 64, or 128 bit value.
operandKindMem
// operandKindImm32 is a signed-32-bit integer immediate value.
operandKindImm32
// operandKindLabel is a label.
operandKindLabel
)
// String implements fmt.Stringer.
func (o operandKind) String() string {
switch o {
case operandKindReg:
return "reg"
case operandKindMem:
return "mem"
case operandKindImm32:
return "imm32"
case operandKindLabel:
return "label"
default:
panic("BUG: invalid operand kind")
}
}
// format returns the string representation of the operand.
// _64 is only relevant when the operand is an integer register; it selects the 64-bit or 32-bit name.
func (o *operand) format(_64 bool) string {
switch o.kind {
case operandKindReg:
return formatVRegSized(o.reg(), _64)
case operandKindMem:
return o.addressMode().String()
case operandKindImm32:
return fmt.Sprintf("$%d", int32(o.imm32()))
case operandKindLabel:
return backend.Label(o.imm32()).String()
default:
panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind))
}
}
//go:inline
func (o *operand) reg() regalloc.VReg {
return regalloc.VReg(o.data)
}
//go:inline
func (o *operand) setReg(r regalloc.VReg) {
o.data = uint64(r)
}
//go:inline
func (o *operand) addressMode() *amode {
return wazevoapi.PtrFromUintptr[amode](uintptr(o.data))
}
//go:inline
func (o *operand) imm32() uint32 {
return uint32(o.data)
}
func (o *operand) label() backend.Label {
switch o.kind {
case operandKindLabel:
return backend.Label(o.data)
case operandKindMem:
mem := o.addressMode()
if mem.kind() != amodeRipRel {
panic("BUG: invalid label")
}
return backend.Label(mem.imm32)
default:
panic("BUG: invalid operand kind")
}
}
func newOperandLabel(label backend.Label) operand {
return operand{kind: operandKindLabel, data: uint64(label)}
}
func newOperandReg(r regalloc.VReg) operand {
return operand{kind: operandKindReg, data: uint64(r)}
}
func newOperandImm32(imm32 uint32) operand {
return operand{kind: operandKindImm32, data: uint64(imm32)}
}
func newOperandMem(amode *amode) operand {
return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))}
}
// amode is a memory operand (addressing mode).
type amode struct {
kindWithShift uint32
imm32 uint32
base regalloc.VReg
// For amodeRegRegShift:
index regalloc.VReg
}
type amodeKind byte
const (
// amodeImmReg calculates sign-extend-32-to-64(Immediate) + base
amodeImmReg amodeKind = iota + 1
// amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP.
// The only difference is that it doesn't tell the register allocator that RBP is used, which would be
// distracting for the register allocator.
amodeImmRBP
// amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift)
amodeRegRegShift
// amodeRipRel is a RIP-relative addressing mode specified by the label.
amodeRipRel
// TODO: there are other addressing modes such as the one without base register.
)
func (a *amode) kind() amodeKind {
return amodeKind(a.kindWithShift & 0xff)
}
func (a *amode) shift() byte {
return byte(a.kindWithShift >> 8)
}
func (a *amode) uses(rs *[]regalloc.VReg) {
switch a.kind() {
case amodeImmReg:
*rs = append(*rs, a.base)
case amodeRegRegShift:
*rs = append(*rs, a.base, a.index)
case amodeImmRBP, amodeRipRel:
default:
panic("BUG: invalid amode kind")
}
}
func (a *amode) nregs() int {
switch a.kind() {
case amodeImmReg:
return 1
case amodeRegRegShift:
return 2
case amodeImmRBP, amodeRipRel:
return 0
default:
panic("BUG: invalid amode kind")
}
}
func (a *amode) assignUses(i int, reg regalloc.VReg) {
switch a.kind() {
case amodeImmReg:
if i == 0 {
a.base = reg
} else {
panic("BUG: invalid amode assignment")
}
case amodeRegRegShift:
if i == 0 {
a.base = reg
} else if i == 1 {
a.index = reg
} else {
panic("BUG: invalid amode assignment")
}
default:
panic("BUG: invalid amode assignment")
}
}
func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode {
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base}
return ret
}
func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode {
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg}
return ret
}
func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode {
if shift > 3 {
panic(fmt.Sprintf("BUG: invalid shift (must be 3>=): %d", shift))
}
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index}
return ret
}
func (m *machine) newAmodeRipRel(label backend.Label) *amode {
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)}
return ret
}
// String implements fmt.Stringer.
func (a *amode) String() string {
switch a.kind() {
case amodeImmReg, amodeImmRBP:
if a.imm32 == 0 {
return fmt.Sprintf("(%s)", formatVRegSized(a.base, true))
}
return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true))
case amodeRegRegShift:
shift := 1 << a.shift()
if a.imm32 == 0 {
return fmt.Sprintf(
"(%s,%s,%d)",
formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
}
return fmt.Sprintf(
"%d(%s,%s,%d)",
int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
case amodeRipRel:
return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32))
default:
panic("BUG: invalid amode kind")
}
}
func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) {
if def.IsFromBlockParam() {
return newOperandReg(def.BlkParamVReg)
}
if def.SSAValue().Type() == ssa.TypeV128 {
// SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment.
return m.getOperand_Reg(def)
}
if m.c.MatchInstr(def, ssa.OpcodeLoad) {
instr := def.Instr
ptr, offset, _ := instr.LoadData()
op = newOperandMem(m.lowerToAddressMode(ptr, offset))
instr.MarkLowered()
return op
}
return m.getOperand_Reg(def)
}
func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
if def.IsFromBlockParam() {
return newOperandReg(def.BlkParamVReg)
}
if m.c.MatchInstr(def, ssa.OpcodeLoad) {
instr := def.Instr
ptr, offset, _ := instr.LoadData()
op = newOperandMem(m.lowerToAddressMode(ptr, offset))
instr.MarkLowered()
return op
}
return m.getOperand_Imm32_Reg(def)
}
func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
if def.IsFromBlockParam() {
return newOperandReg(def.BlkParamVReg)
}
instr := def.Instr
if instr.Constant() {
// If the operation is 64-bit, x64 sign-extends the 32-bit immediate value.
// Therefore, the immediate can only be used if the value fits in 32 bits and, for 64-bit operations,
// its sign bit is clear (otherwise sign extension would change the value).
if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok {
instr.MarkLowered()
return op
}
}
return m.getOperand_Reg(def)
}
func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) {
if imm32, ok := asImm32(val, allowSignExt); ok {
return newOperandImm32(imm32), true
}
return operand{}, false
}
func asImm32(val uint64, allowSignExt bool) (uint32, bool) {
u32val := uint32(val)
if uint64(u32val) != val {
return 0, false
}
if !allowSignExt && u32val&0x80000000 != 0 {
return 0, false
}
return u32val, true
}
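To make the sign-extension rule above concrete: a 32-bit immediate used in a 64-bit operation is sign-extended, so 0x80000000 cannot be encoded there even though it fits in 32 bits. A small, self-contained illustration of the rule asImm32 enforces (my sketch, not part of this file):

package main

import "fmt"

// fitsImm32 mirrors asImm32: allowSignExt is true only for 32-bit operations.
func fitsImm32(val uint64, allowSignExt bool) bool {
	u32 := uint32(val)
	if uint64(u32) != val {
		return false // does not fit in 32 bits at all.
	}
	return allowSignExt || u32&0x8000_0000 == 0
}

func main() {
	fmt.Println(fitsImm32(0x7fffffff, false)) // true: fits, sign bit clear.
	fmt.Println(fitsImm32(0x80000000, false)) // false: would sign-extend to 0xffffffff80000000.
	fmt.Println(fitsImm32(0x80000000, true))  // true: fine for a 32-bit operation.
}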
func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) {
var v regalloc.VReg
if def.IsFromBlockParam() {
v = def.BlkParamVReg
} else {
instr := def.Instr
if instr.Constant() {
// We inline all the constant instructions so that we can reduce register usage.
v = m.lowerConstant(instr)
instr.MarkLowered()
} else {
if n := def.N; n == 0 {
v = m.c.VRegOf(instr.Return())
} else {
_, rs := instr.Returns()
v = m.c.VRegOf(rs[n-1])
}
}
}
return newOperandReg(v)
}

View File

@ -0,0 +1,11 @@
//go:build !tinygo
package amd64
import "reflect"
// setSliceLimits sets both Cap and Len for the given reflected slice.
func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
s.Len = int(limit)
s.Cap = int(limit)
}

View File

@ -0,0 +1,11 @@
//go:build tinygo
package amd64
import "reflect"
// setSliceLimits sets both Cap and Len for the given reflected slice.
func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
s.Len = limit
s.Cap = limit
}

View File

@ -0,0 +1,181 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
)
// Amd64-specific registers.
const (
// rax is a gp register.
rax = regalloc.RealRegInvalid + 1 + iota
// rcx is a gp register.
rcx
// rdx is a gp register.
rdx
// rbx is a gp register.
rbx
// rsp is a gp register.
rsp
// rbp is a gp register.
rbp
// rsi is a gp register.
rsi
// rdi is a gp register.
rdi
// r8 is a gp register.
r8
// r9 is a gp register.
r9
// r10 is a gp register.
r10
// r11 is a gp register.
r11
// r12 is a gp register.
r12
// r13 is a gp register.
r13
// r14 is a gp register.
r14
// r15 is a gp register.
r15
// xmm0 is a vector register.
xmm0
// xmm1 is a vector register.
xmm1
// xmm2 is a vector register.
xmm2
// xmm3 is a vector register.
xmm3
// xmm4 is a vector register.
xmm4
// xmm5 is a vector register.
xmm5
// xmm6 is a vector register.
xmm6
// xmm7 is a vector register.
xmm7
// xmm8 is a vector register.
xmm8
// xmm9 is a vector register.
xmm9
// xmm10 is a vector register.
xmm10
// xmm11 is a vector register.
xmm11
// xmm12 is a vector register.
xmm12
// xmm13 is a vector register.
xmm13
// xmm14 is a vector register.
xmm14
// xmm15 is a vector register.
xmm15
)
var (
raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt)
rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt)
rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt)
rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt)
rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt)
rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt)
rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt)
rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt)
r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt)
r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt)
r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt)
r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt)
r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt)
r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt)
r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt)
r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt)
xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat)
xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat)
xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat)
xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat)
xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat)
xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat)
xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat)
xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat)
xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat)
xmm9VReg = regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat)
xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat)
xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat)
xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat)
xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat)
xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat)
xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat)
)
var regNames = [...]string{
rax: "rax",
rcx: "rcx",
rdx: "rdx",
rbx: "rbx",
rsp: "rsp",
rbp: "rbp",
rsi: "rsi",
rdi: "rdi",
r8: "r8",
r9: "r9",
r10: "r10",
r11: "r11",
r12: "r12",
r13: "r13",
r14: "r14",
r15: "r15",
xmm0: "xmm0",
xmm1: "xmm1",
xmm2: "xmm2",
xmm3: "xmm3",
xmm4: "xmm4",
xmm5: "xmm5",
xmm6: "xmm6",
xmm7: "xmm7",
xmm8: "xmm8",
xmm9: "xmm9",
xmm10: "xmm10",
xmm11: "xmm11",
xmm12: "xmm12",
xmm13: "xmm13",
xmm14: "xmm14",
xmm15: "xmm15",
}
func formatVRegSized(r regalloc.VReg, _64 bool) string {
if r.IsRealReg() {
if r.RegType() == regalloc.RegTypeInt {
rr := r.RealReg()
orig := regNames[rr]
if rr <= rdi {
if _64 {
return "%" + orig
} else {
return "%e" + orig[1:]
}
} else {
if _64 {
return "%" + orig
} else {
return "%" + orig + "d"
}
}
} else {
return "%" + regNames[r.RealReg()]
}
} else {
if r.RegType() == regalloc.RegTypeInt {
if _64 {
return fmt.Sprintf("%%r%d?", r.ID())
} else {
return fmt.Sprintf("%%r%dd?", r.ID())
}
} else {
return fmt.Sprintf("%%xmm%d?", r.ID())
}
}
}
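For reference, the naming rule above follows AT&T conventions: legacy integer registers swap their leading "r" for "e" in 32-bit form, r8-r15 gain a "d" suffix, and virtual registers get a trailing "?". A tiny standalone sketch of just that rule (illustrative, independent of this package):

package main

import "fmt"

// name32 shows the 32-bit renaming applied by formatVRegSized for real integer registers.
func name32(orig string, legacy bool) string {
	if legacy {
		return "%e" + orig[1:] // e.g. rax -> %eax
	}
	return "%" + orig + "d" // e.g. r8 -> %r8d
}

func main() {
	fmt.Println(name32("rax", true), name32("r8", false)) // %eax %r8d
}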

View File

@ -0,0 +1,128 @@
package amd64
import (
"encoding/binary"
"reflect"
"unsafe"
"github.com/tetratelabs/wazero/internal/wasmdebug"
)
func stackView(rbp, top uintptr) []byte {
var stackBuf []byte
{
// TODO: use unsafe.Slice after floor version is set to Go 1.20.
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
hdr.Data = rbp
setSliceLimits(hdr, top-rbp)
}
return stackBuf
}
// UnwindStack implements wazevo.unwindStack.
func UnwindStack(_, rbp, top uintptr, returnAddresses []uintptr) []uintptr {
stackBuf := stackView(rbp, top)
for i := uint64(0); i < uint64(len(stackBuf)); {
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- Caller_RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- RBP
// (low address)
callerRBP := binary.LittleEndian.Uint64(stackBuf[i:])
retAddr := binary.LittleEndian.Uint64(stackBuf[i+8:])
returnAddresses = append(returnAddresses, uintptr(retAddr))
i = callerRBP - uint64(rbp)
if len(returnAddresses) == wasmdebug.MaxFrames {
break
}
}
return returnAddresses
}
// GoCallStackView implements wazevo.goCallStackView.
func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
// (high address)
// +-----------------+ <----+
// | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
// ^ | arg[N]/ret[M] | |
// sliceSize | | ............ | | SizeInBytes/8
// | | arg[1]/ret[1] | |
// v | arg[0]/ret[0] | <----+
// | SizeInBytes |
// +-----------------+ <---- stackPointerBeforeGoCall
// (low address)
data := unsafe.Pointer(uintptr(unsafe.Pointer(stackPointerBeforeGoCall)) + 8)
size := *stackPointerBeforeGoCall / 8
return unsafe.Slice((*uint64)(data), int(size))
}
func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) {
diff := uint64(rsp - oldRsp)
newBuf := stackView(rbp, top)
for i := uint64(0); i < uint64(len(newBuf)); {
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- Caller_RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- RBP
// (low address)
callerRBP := binary.LittleEndian.Uint64(newBuf[i:])
if callerRBP == 0 {
// End of stack.
break
}
if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) {
panic("BUG: callerRBP is out of range")
}
if int(callerRBP) < 0 {
panic("BUG: callerRBP is negative")
}
adjustedCallerRBP := callerRBP + diff
if int(adjustedCallerRBP) < 0 {
panic("BUG: adjustedCallerRBP is negative")
}
binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP)
i = adjustedCallerRBP - uint64(rbp)
}
}

View File

@ -0,0 +1,332 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// References:
// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture
// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
var (
intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7}
floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7}
)
var regInfo = &regalloc.RegisterInfo{
AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
// We don't allocate:
// - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers
// - x28: Reserved by Go runtime.
// - x27(=tmpReg): because of the reason described on tmpReg.
regalloc.RegTypeInt: {
x8, x9, x10, x11, x12, x13, x14, x15,
x16, x17, x19, x20, x21, x22, x23, x24, x25,
x26, x29, x30,
// These are the argument/return registers. Less preferred in the allocation.
x7, x6, x5, x4, x3, x2, x1, x0,
},
regalloc.RegTypeFloat: {
v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
// These are the argument/return registers. Less preferred in the allocation.
v7, v6, v5, v4, v3, v2, v1, v0,
},
},
CalleeSavedRegisters: regalloc.NewRegSet(
x19, x20, x21, x22, x23, x24, x25, x26, x28,
v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
),
CallerSavedRegisters: regalloc.NewRegSet(
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30,
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
),
RealRegToVReg: []regalloc.VReg{
x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg,
v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg,
},
RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
RealRegType: func(r regalloc.RealReg) regalloc.RegType {
if r < v0 {
return regalloc.RegTypeInt
}
return regalloc.RegTypeFloat
},
}
// ArgsResultsRegs implements backend.Machine.
func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
return intParamResultRegs, floatParamResultRegs
}
// LowerParams implements backend.FunctionABI.
func (m *machine) LowerParams(args []ssa.Value) {
a := m.currentABI
for i, ssaArg := range args {
if !ssaArg.Valid() {
continue
}
reg := m.compiler.VRegOf(ssaArg)
arg := &a.Args[i]
if arg.Kind == backend.ABIArgKindReg {
m.InsertMove(reg, arg.Reg, arg.Type)
} else {
// TODO: we could use pair load if there's consecutive loads for the same type.
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 | <-|
// | ReturnAddress | |
// +-----------------+ |
// | ........... | |
// | clobbered M | | argStackOffset: is unknown at this point of compilation.
// | ............ | |
// | clobbered 0 | |
// | spill slot N | |
// | ........... | |
// | spill slot 0 | |
// SP---> +-----------------+ <-+
// (low address)
bits := arg.Type.Bits()
// At this point of compilation, we don't yet know how much space exist below the return address.
// So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation.
amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
load := m.allocateInstr()
switch arg.Type {
case ssa.TypeI32, ssa.TypeI64:
load.asULoad(operandNR(reg), amode, bits)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
load.asFpuLoad(operandNR(reg), amode, bits)
default:
panic("BUG")
}
m.insert(load)
m.unresolvedAddressModes = append(m.unresolvedAddressModes, load)
}
}
}
// LowerReturns lowers the given returns.
func (m *machine) LowerReturns(rets []ssa.Value) {
a := m.currentABI
l := len(rets) - 1
for i := range rets {
// Reverse order in order to avoid overwriting the stack returns existing in the return registers.
ret := rets[l-i]
r := &a.Rets[l-i]
reg := m.compiler.VRegOf(ret)
if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() {
// Constant instructions are inlined.
if inst := def.Instr; inst.Constant() {
val := inst.Return()
valType := val.Type()
v := inst.ConstantVal()
m.insertLoadConstant(v, valType, reg)
}
}
if r.Kind == backend.ABIArgKindReg {
m.InsertMove(r.Reg, reg, ret.Type())
} else {
// TODO: we could use pair store if there's consecutive stores for the same type.
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 | <-+
// | arg X | |
// | ....... | |
// | arg 1 | |
// | arg 0 | |
// | ReturnAddress | |
// +-----------------+ |
// | ........... | |
// | spill slot M | | retStackOffset: is unknown at this point of compilation.
// | ............ | |
// | spill slot 2 | |
// | spill slot 1 | |
// | clobbered 0 | |
// | clobbered 1 | |
// | ........... | |
// | clobbered N | |
// SP---> +-----------------+ <-+
// (low address)
bits := r.Type.Bits()
// At this point of compilation, we don't yet know how much space exist below the return address.
// So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation.
amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
store := m.allocateInstr()
store.asStore(operandNR(reg), amode, bits)
m.insert(store)
m.unresolvedAddressModes = append(m.unresolvedAddressModes, store)
}
}
}
// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the
// caller side of the function call.
func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) {
arg := &a.Args[argIndex]
if def != nil && def.IsFromInstr() {
// Constant instructions are inlined.
if inst := def.Instr; inst.Constant() {
val := inst.Return()
valType := val.Type()
v := inst.ConstantVal()
m.insertLoadConstant(v, valType, reg)
}
}
if arg.Kind == backend.ABIArgKindReg {
m.InsertMove(arg.Reg, reg, arg.Type)
} else {
// TODO: we could use pair store if there's consecutive stores for the same type.
//
// Note that at this point, stack pointer is already adjusted.
bits := arg.Type.Bits()
amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false)
store := m.allocateInstr()
store.asStore(operandNR(reg), amode, bits)
m.insert(store)
}
}
func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) {
r := &a.Rets[retIndex]
if r.Kind == backend.ABIArgKindReg {
m.InsertMove(reg, r.Reg, r.Type)
} else {
// TODO: we could use pair load if there's consecutive loads for the same type.
amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false)
ldr := m.allocateInstr()
switch r.Type {
case ssa.TypeI32, ssa.TypeI64:
ldr.asULoad(operandNR(reg), amode, r.Type.Bits())
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits())
default:
panic("BUG")
}
m.insert(ldr)
}
}
func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) {
exct := m.executableContext
exct.PendingInstructions = exct.PendingInstructions[:0]
mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse)
for _, instr := range exct.PendingInstructions {
cur = linkInstr(cur, instr)
}
return cur, mode
}
func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode {
if rn.RegType() != regalloc.RegTypeInt {
panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64))
}
var amode addressMode
if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) {
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
} else if offsetFitsInAddressModeKindRegSignedImm9(offset) {
amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
} else {
var indexReg regalloc.VReg
if allowTmpRegUse {
m.lowerConstantI64(tmpRegVReg, offset)
indexReg = tmpRegVReg
} else {
indexReg = m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(indexReg, offset)
}
amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
}
return amode
}
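The selection above follows the standard arm64 load/store encodings. As a rough sketch of the decision (the exact bounds live in offsetFitsInAddressModeKindRegUnsignedImm12 and offsetFitsInAddressModeKindRegSignedImm9, which are not shown here, so the limits below are an assumption based on the architecture rather than on this file):

package main

import "fmt"

// pickAmode sketches the choice made by resolveAddressModeForOffset, assuming the
// usual arm64 limits: a scaled unsigned 12-bit immediate, else a signed 9-bit
// immediate, else the offset is materialized into an index register.
func pickAmode(offset, sizeInBytes int64) string {
	switch {
	case offset >= 0 && offset%sizeInBytes == 0 && offset/sizeInBytes < 4096:
		return "RegUnsignedImm12" // e.g. ldr x0, [rn, #offset]
	case offset >= -256 && offset <= 255:
		return "RegSignedImm9" // e.g. ldur x0, [rn, #offset]
	default:
		return "RegReg" // lowerConstantI64 into a register, then [rn, rm]
	}
}

func main() {
	fmt.Println(pickAmode(32760, 8)) // RegUnsignedImm12
	fmt.Println(pickAmode(-8, 8))    // RegSignedImm9
	fmt.Println(pickAmode(1<<20, 8)) // RegReg
}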
func (m *machine) lowerCall(si *ssa.Instruction) {
isDirectCall := si.Opcode() == ssa.OpcodeCall
var indirectCalleePtr ssa.Value
var directCallee ssa.FuncRef
var sigID ssa.SignatureID
var args []ssa.Value
if isDirectCall {
directCallee, sigID, args = si.CallData()
} else {
indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData()
}
calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID))
stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame.
}
for i, arg := range args {
reg := m.compiler.VRegOf(arg)
def := m.compiler.ValueDefinition(arg)
m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
}
if isDirectCall {
call := m.allocateInstr()
call.asCall(directCallee, calleeABI)
m.insert(call)
} else {
ptr := m.compiler.VRegOf(indirectCalleePtr)
callInd := m.allocateInstr()
callInd.asCallIndirect(ptr, calleeABI)
m.insert(callInd)
}
var index int
r1, rs := si.Returns()
if r1.Valid() {
m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize)
index++
}
for _, r := range rs {
m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize)
index++
}
}
func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) {
if imm12Operand, ok := asImm12Operand(uint64(diff)); ok {
alu := m.allocateInstr()
var ao aluOp
if add {
ao = aluOpAdd
} else {
ao = aluOpSub
}
alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true)
m.insert(alu)
} else {
m.lowerConstantI64(tmpRegVReg, diff)
alu := m.allocateInstr()
var ao aluOp
if add {
ao = aluOpAdd
} else {
ao = aluOpSub
}
alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true)
m.insert(alu)
}
}

View File

@ -0,0 +1,9 @@
package arm64
// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below.
// This implements wazevo.entrypoint, and see the comments there for detail.
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)

View File

@ -0,0 +1,29 @@
//go:build arm64
#include "funcdata.h"
#include "textflag.h"
// See the comments on EmitGoEntryPreamble for what this function is supposed to do.
TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
MOVD preambleExecutable+0(FP), R27
MOVD functionExecutable+8(FP), R24
MOVD executionContextPtr+16(FP), R0
MOVD moduleContextPtr+24(FP), R1
MOVD paramResultSlicePtr+32(FP), R19
MOVD goAllocatedStackSlicePtr+40(FP), R26
JMP (R27)
TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
MOVD goCallReturnAddress+0(FP), R20
MOVD executionContextPtr+8(FP), R0
MOVD stackPointer+16(FP), R19
// Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0).
MOVD R29, 16(R0) // Store FP(R29) into [R0, #ExecutionContextOffsets.OriginalFramePointer]
MOVD RSP, R27 // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions.
MOVD R27, 24(R0) // Store R27 into [R0, #ExecutionContextOffsets.OriginalStackPointer]
MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress]
// Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP.
MOVD R19, RSP
JMP (R20)

View File

@ -0,0 +1,230 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
// CompileEntryPreamble implements backend.Machine. This assumes `entrypoint` function (in abi_go_entry_arm64.s) passes:
//
// 1. First (execution context ptr) and Second arguments are already passed in x0, and x1.
// 2. param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values.
// 3. Go-allocated stack slice ptr in x26.
// 4. Function executable in x24.
//
// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller.
func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte {
root := m.constructEntryPreamble(signature)
m.encode(root)
return m.compiler.Buf()
}
var (
executionContextPtrReg = x0VReg
// callee-saved regs so that they can be used in the prologue and epilogue.
paramResultSlicePtr = x19VReg
savedExecutionContextPtr = x20VReg
// goAllocatedStackPtr is not used in the epilogue.
goAllocatedStackPtr = x26VReg
// paramResultSliceCopied is not used in the epilogue.
paramResultSliceCopied = x25VReg
// tmpRegVReg is not used in the epilogue.
functionExecutable = x24VReg
)
func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction {
typ := arg.Type
bits := typ.Bits()
isStackArg := arg.Kind == backend.ABIArgKindStack
var loadTargetReg operand
if !isStackArg {
loadTargetReg = operandNR(arg.Reg)
} else {
switch typ {
case ssa.TypeI32, ssa.TypeI64:
loadTargetReg = operandNR(x15VReg)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
loadTargetReg = operandNR(v15VReg)
default:
panic("TODO?")
}
}
var postIndexImm int64
if typ == ssa.TypeV128 {
postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
} else {
postIndexImm = 8
}
loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
instr := m.allocateInstr()
switch typ {
case ssa.TypeI32:
instr.asULoad(loadTargetReg, loadMode, 32)
case ssa.TypeI64:
instr.asULoad(loadTargetReg, loadMode, 64)
case ssa.TypeF32:
instr.asFpuLoad(loadTargetReg, loadMode, 32)
case ssa.TypeF64:
instr.asFpuLoad(loadTargetReg, loadMode, 64)
case ssa.TypeV128:
instr.asFpuLoad(loadTargetReg, loadMode, 128)
}
cur = linkInstr(cur, instr)
if isStackArg {
var storeMode addressMode
cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true)
toStack := m.allocateInstr()
toStack.asStore(loadTargetReg, storeMode, bits)
cur = linkInstr(cur, toStack)
}
return cur
}
func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction {
isStackArg := result.Kind == backend.ABIArgKindStack
typ := result.Type
bits := typ.Bits()
var storeTargetReg operand
if !isStackArg {
storeTargetReg = operandNR(result.Reg)
} else {
switch typ {
case ssa.TypeI32, ssa.TypeI64:
storeTargetReg = operandNR(x15VReg)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
storeTargetReg = operandNR(v15VReg)
default:
panic("TODO?")
}
}
var postIndexImm int64
if typ == ssa.TypeV128 {
postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
} else {
postIndexImm = 8
}
if isStackArg {
var loadMode addressMode
cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true)
toReg := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
toReg.asULoad(storeTargetReg, loadMode, bits)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
toReg.asFpuLoad(storeTargetReg, loadMode, bits)
default:
panic("TODO?")
}
cur = linkInstr(cur, toReg)
}
mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
instr := m.allocateInstr()
instr.asStore(storeTargetReg, mode, bits)
cur = linkInstr(cur, instr)
return cur
}
func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) {
abi := backend.FunctionABI{}
abi.Init(sig, intParamResultRegs, floatParamResultRegs)
root = m.allocateNop()
//// ----------------------------------- prologue ----------------------------------- ////
// First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
// mov savedExecutionContextPtr, x0
cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root)
// Next, save the current FP, SP and LR into the wazevo.executionContext:
// str fp, [savedExecutionContextPtr, #OriginalFramePointer]
// mov tmp, sp ;; sp cannot be str'ed directly.
// str sp, [savedExecutionContextPtr, #OriginalStackPointer]
// str lr, [savedExecutionContextPtr, #GoReturnAddress]
cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur)
cur = m.move64(tmpRegVReg, spVReg, cur)
cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur)
cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur)
// Then, move the Go-allocated stack pointer to SP:
// mov sp, goAllocatedStackPtr
cur = m.move64(spVReg, goAllocatedStackPtr, cur)
prReg := paramResultSlicePtr
if len(abi.Args) > 2 && len(abi.Rets) > 0 {
// paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg,
// so copy it to another reg.
cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur)
prReg = paramResultSliceCopied
}
stackSlotSize := int64(abi.AlignedArgResultStackSlotSize())
for i := range abi.Args {
if i < 2 {
// module context ptr and execution context ptr are passed in x0 and x1 by the Go assembly function.
continue
}
arg := &abi.Args[i]
cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize)
}
// Call the real function.
bl := m.allocateInstr()
bl.asCallIndirect(functionExecutable, &abi)
cur = linkInstr(cur, bl)
///// ----------------------------------- epilogue ----------------------------------- /////
// Store the register results into paramResultSlicePtr.
for i := range abi.Rets {
cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize)
}
// Finally, restore the FP, SP and LR, and return to the Go code.
// ldr fp, [savedExecutionContextPtr, #OriginalFramePointer]
// ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer]
// mov sp, tmp ;; sp cannot be str'ed directly.
// ldr lr, [savedExecutionContextPtr, #GoReturnAddress]
// ret ;; --> return to the Go code
cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur)
cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur)
cur = m.move64(spVReg, tmpRegVReg, cur)
cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur)
retInst := m.allocateInstr()
retInst.asRet()
linkInstr(cur, retInst)
return
}
func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction {
instr := m.allocateInstr()
instr.asMove64(dst, src)
return linkInstr(prev, instr)
}
func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction {
instr := m.allocateInstr()
mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
if store {
instr.asStore(operandNR(d), mode, 64)
} else {
instr.asULoad(operandNR(d), mode, 64)
}
return linkInstr(prev, instr)
}
func linkInstr(prev, next *instruction) *instruction {
prev.next = next
next.prev = prev
return next
}

View File

@ -0,0 +1,428 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
var calleeSavedRegistersSorted = []regalloc.VReg{
x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg,
v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}
// CompileGoFunctionTrampoline implements backend.Machine.
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
exct := m.executableContext
argBegin := 1 // Skips exec context by default.
if needModuleContextPtr {
argBegin++
}
abi := &backend.FunctionABI{}
abi.Init(sig, intParamResultRegs, floatParamResultRegs)
m.currentABI = abi
cur := m.allocateInstr()
cur.asNop0()
exct.RootInstr = cur
// Execution context is always the first argument.
execCtrPtr := x0VReg
// In the following, we create the following stack layout:
//
// (high address)
// SP ------> +-----------------+ <----+
// | ....... | |
// | ret Y | |
// | ....... | |
// | ret 0 | |
// | arg X | | size_of_arg_ret
// | ....... | |
// | arg 1 | |
// | arg 0 | <----+ <-------- originalArg0Reg
// | size_of_arg_ret |
// | ReturnAddress |
// +-----------------+ <----+
// | xxxx | | ;; might be padded to make it 16-byte aligned.
// +--->| arg[N]/ret[M] | |
// sliceSize| | ............ | | goCallStackSize
// | | arg[1]/ret[1] | |
// +--->| arg[0]/ret[0] | <----+ <-------- arg0ret0AddrReg
// | sliceSize |
// | frame_size |
// +-----------------+
// (low address)
//
// where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
// therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
// the arguments/return values.
// First of all, to update the SP, and create "ReturnAddress + size_of_arg_ret".
cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
const frameInfoSize = 16 // == frame_size + sliceSize.
// Next, we should allocate the stack for the Go function call if necessary.
goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur)
originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want.
if m.currentABI.AlignedArgResultStackSlotSize() > 0 {
// At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot.
cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true)
}
// Save the callee saved registers.
cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
if needModuleContextPtr {
offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64()
if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) {
panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context")
}
// Module context is always the second argument.
moduleCtrPtr := x1VReg
store := m.allocateInstr()
amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
store.asStore(operandNR(moduleCtrPtr), amode, 64)
cur = linkInstr(cur, store)
}
// Advances the stack pointer.
cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false)
// Copy the pointer to x15VReg.
arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want.
copySp := m.allocateInstr()
copySp.asMove64(arg0ret0AddrReg, spVReg)
cur = linkInstr(cur, copySp)
// Next, we need to store all the arguments to the stack in the typical Wasm stack style.
for i := range abi.Args[argBegin:] {
arg := &abi.Args[argBegin+i]
store := m.allocateInstr()
var v regalloc.VReg
if arg.Kind == backend.ABIArgKindReg {
v = arg.Reg
} else {
cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg,
// Caller save, so we can use it for whatever we want.
x11VReg, v11VReg)
}
var sizeInBits byte
if arg.Type == ssa.TypeV128 {
sizeInBits = 128
} else {
sizeInBits = 64
}
store.asStore(operandNR(v),
addressMode{
kind: addressModeKindPostIndex,
rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8),
}, sizeInBits)
cur = linkInstr(cur, store)
}
// Finally, now that we've advanced SP to arg[0]/ret[0], we allocate `frame_size + sliceSize`.
var frameSizeReg, sliceSizeReg regalloc.VReg
if goCallStackSize > 0 {
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize)
frameSizeReg = tmpRegVReg
cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8)
sliceSizeReg = x16VReg
} else {
frameSizeReg = xzrVReg
sliceSizeReg = xzrVReg
}
_amode := addressModePreOrPostIndex(spVReg, -16, true)
storeP := m.allocateInstr()
storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode)
cur = linkInstr(cur, storeP)
// Set the exit status on the execution context.
cur = m.setExitCode(cur, x0VReg, exitCode)
// Save the current stack pointer.
cur = m.saveCurrentStackPointer(cur, x0VReg)
// Exit the execution.
cur = m.storeReturnAddressAndExit(cur)
// After the call, we need to restore the callee saved registers.
cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
// Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`.
if len(abi.Rets) > 0 {
cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true)
}
// Advances the SP so that it points to `ReturnAddress`.
cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true)
ldr := m.allocateInstr()
// And load the return address.
ldr.asULoad(operandNR(lrVReg),
addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
cur = linkInstr(cur, ldr)
originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want.
if m.currentABI.RetStackSize > 0 {
cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true)
}
// Make the SP point to the original address (above the result slot).
if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
}
for i := range abi.Rets {
r := &abi.Rets[i]
if r.Kind == backend.ABIArgKindReg {
loadIntoReg := m.allocateInstr()
mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
switch r.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asULoad(operandNR(r.Reg), mode, 32)
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asULoad(operandNR(r.Reg), mode, 64)
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32)
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64)
case ssa.TypeV128:
mode.imm = 16
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128)
default:
panic("TODO")
}
cur = linkInstr(cur, loadIntoReg)
} else {
// First we need to load the value to a temporary just like ^^.
intTmp, floatTmp := x11VReg, v11VReg
loadIntoTmpReg := m.allocateInstr()
mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
var resultReg regalloc.VReg
switch r.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32)
resultReg = intTmp
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64)
resultReg = intTmp
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32)
resultReg = floatTmp
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64)
resultReg = floatTmp
case ssa.TypeV128:
mode.imm = 16
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128)
resultReg = floatTmp
default:
panic("TODO")
}
cur = linkInstr(cur, loadIntoTmpReg)
cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg)
}
}
ret := m.allocateInstr()
ret.asRet()
linkInstr(cur, ret)
m.encode(m.executableContext.RootInstr)
return m.compiler.Buf()
}
func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
store := m.allocateInstr()
var sizeInBits byte
switch v.RegType() {
case regalloc.RegTypeInt:
sizeInBits = 64
case regalloc.RegTypeFloat:
sizeInBits = 128
}
store.asStore(operandNR(v),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: offset,
}, sizeInBits)
store.prev = cur
cur.next = store
cur = store
offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally store regs at the offset of multiple of 16.
}
return cur
}
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
load := m.allocateInstr()
var as func(dst operand, amode addressMode, sizeInBits byte)
var sizeInBits byte
switch v.RegType() {
case regalloc.RegTypeInt:
as = load.asULoad
sizeInBits = 64
case regalloc.RegTypeFloat:
as = load.asFpuLoad
sizeInBits = 128
}
as(operandNR(v),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: offset,
}, sizeInBits)
cur = linkInstr(cur, load)
offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16.
}
return cur
}
func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction {
exct := m.executableContext
exct.PendingInstructions = exct.PendingInstructions[:0]
m.lowerConstantI64(dst, v)
for _, instr := range exct.PendingInstructions {
cur = linkInstr(cur, instr)
}
return cur
}
func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction {
exct := m.executableContext
exct.PendingInstructions = exct.PendingInstructions[:0]
m.lowerConstantI32(dst, v)
for _, instr := range exct.PendingInstructions {
cur = linkInstr(cur, instr)
}
return cur
}
func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction {
constReg := x17VReg // caller-saved, so we can use it.
cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode))
// Set the exit status on the execution context.
setExitStatus := m.allocateInstr()
setExitStatus.asStore(operandNR(constReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
}, 32)
cur = linkInstr(cur, setExitStatus)
return cur
}
func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction {
// Read the return address into tmp, and store it in the execution context.
adr := m.allocateInstr()
adr.asAdr(tmpRegVReg, exitSequenceSize+8)
cur = linkInstr(cur, adr)
storeReturnAddr := m.allocateInstr()
storeReturnAddr.asStore(operandNR(tmpRegVReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
}, 64)
cur = linkInstr(cur, storeReturnAddr)
// Exit the execution.
trapSeq := m.allocateInstr()
trapSeq.asExitSequence(x0VReg)
cur = linkInstr(cur, trapSeq)
return cur
}
func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction {
// Save the current stack pointer:
// mov tmp, sp,
// str tmp, [exec_ctx, #stackPointerBeforeGoCall]
movSp := m.allocateInstr()
movSp.asMove64(tmpRegVReg, spVReg)
cur = linkInstr(cur, movSp)
strSp := m.allocateInstr()
strSp.asStore(operandNR(tmpRegVReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
}, 64)
cur = linkInstr(cur, strSp)
return cur
}
func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) {
load := m.allocateInstr()
var result regalloc.VReg
mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
switch arg.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asULoad(operandNR(intVReg), mode, 32)
result = intVReg
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asULoad(operandNR(intVReg), mode, 64)
result = intVReg
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asFpuLoad(operandNR(floatVReg), mode, 32)
result = floatVReg
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asFpuLoad(operandNR(floatVReg), mode, 64)
result = floatVReg
case ssa.TypeV128:
mode.imm = 16
load.asFpuLoad(operandNR(floatVReg), mode, 128)
result = floatVReg
default:
panic("TODO")
}
cur = linkInstr(cur, load)
return cur, result
}
func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction {
store := m.allocateInstr()
mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
var sizeInBits byte
switch result.Type {
case ssa.TypeI32, ssa.TypeF32:
mode.imm = 8
sizeInBits = 32
case ssa.TypeI64, ssa.TypeF64:
mode.imm = 8
sizeInBits = 64
case ssa.TypeV128:
mode.imm = 16
sizeInBits = 128
default:
panic("TODO")
}
store.asStore(operandNR(resultVReg), mode, sizeInBits)
return linkInstr(cur, store)
}
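// Illustrative sketch (not part of this change): the type switches above advance the
// post-index pointer by a fixed slot size per value, 8 bytes for scalars and 16 for v128.
// The helper below is hypothetical and only restates that slot-size choice.
func exampleGoCallStackSlotSize(t ssa.Type) int64 {
switch t {
case ssa.TypeI32, ssa.TypeI64, ssa.TypeF32, ssa.TypeF64:
return 8 // Scalars always occupy a uint64-sized slot.
case ssa.TypeV128:
return 16 // SIMD v128 needs a full 16-byte slot.
default:
panic("TODO")
}
}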

View File

@ -0,0 +1,215 @@
package arm64
import (
"strconv"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type (
cond uint64
condKind byte
)
const (
// condKindRegisterZero represents a condition which checks if the register is zero.
// This indicates that the instruction must be encoded as CBZ:
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero-
condKindRegisterZero condKind = iota
// condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ:
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero-
condKindRegisterNotZero
// condKindCondFlagSet indicates that the instruction must be encoded as B.cond:
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-
condKindCondFlagSet
)
// kind returns the kind of condition which is stored in the first two bits.
func (c cond) kind() condKind {
return condKind(c & 0b11)
}
func (c cond) asUint64() uint64 {
return uint64(c)
}
// register returns the register for register conditions.
// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero).
func (c cond) register() regalloc.VReg {
if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero {
panic("condition is not a register")
}
return regalloc.VReg(c >> 2)
}
func registerAsRegZeroCond(r regalloc.VReg) cond {
return cond(r)<<2 | cond(condKindRegisterZero)
}
func registerAsRegNotZeroCond(r regalloc.VReg) cond {
return cond(r)<<2 | cond(condKindRegisterNotZero)
}
func (c cond) flag() condFlag {
if c.kind() != condKindCondFlagSet {
panic("condition is not a flag")
}
return condFlag(c >> 2)
}
func (c condFlag) asCond() cond {
return cond(c)<<2 | cond(condKindCondFlagSet)
}
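// Illustrative sketch (not part of this change): a cond packs its kind into the low two bits
// and the payload (register or flag) into the remaining bits, so the constructors and
// accessors above round-trip. The hypothetical function below only demonstrates that invariant.
func exampleCondRoundTrip(r regalloc.VReg) {
if c := registerAsRegZeroCond(r); c.kind() != condKindRegisterZero || c.register() != r {
panic("BUG in example: register cond must round-trip")
}
if c := eq.asCond(); c.kind() != condKindCondFlagSet || c.flag() != eq {
panic("BUG in example: flag cond must round-trip")
}
}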
// condFlag represents a condition flag for conditional branches.
// The value matches the encoding of condition flags in the ARM64 instruction set.
// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions
type condFlag uint8
const (
eq condFlag = iota // eq represents "equal"
ne // ne represents "not equal"
hs // hs represents "higher or same"
lo // lo represents "lower"
mi // mi represents "minus or negative result"
pl // pl represents "plus or positive result"
vs // vs represents "overflow set"
vc // vc represents "overflow clear"
hi // hi represents "higher"
ls // ls represents "lower or same"
ge // ge represents "greater or equal"
lt // lt represents "less than"
gt // gt represents "greater than"
le // le represents "less than or equal"
al // al represents "always"
nv // nv represents "never"
)
// invert returns the inverted condition.
func (c condFlag) invert() condFlag {
switch c {
case eq:
return ne
case ne:
return eq
case hs:
return lo
case lo:
return hs
case mi:
return pl
case pl:
return mi
case vs:
return vc
case vc:
return vs
case hi:
return ls
case ls:
return hi
case ge:
return lt
case lt:
return ge
case gt:
return le
case le:
return gt
case al:
return nv
case nv:
return al
default:
panic(c)
}
}
// String implements fmt.Stringer.
func (c condFlag) String() string {
switch c {
case eq:
return "eq"
case ne:
return "ne"
case hs:
return "hs"
case lo:
return "lo"
case mi:
return "mi"
case pl:
return "pl"
case vs:
return "vs"
case vc:
return "vc"
case hi:
return "hi"
case ls:
return "ls"
case ge:
return "ge"
case lt:
return "lt"
case gt:
return "gt"
case le:
return "le"
case al:
return "al"
case nv:
return "nv"
default:
panic(strconv.Itoa(int(c)))
}
}
// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond.
func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag {
switch c {
case ssa.IntegerCmpCondEqual:
return eq
case ssa.IntegerCmpCondNotEqual:
return ne
case ssa.IntegerCmpCondSignedLessThan:
return lt
case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
return ge
case ssa.IntegerCmpCondSignedGreaterThan:
return gt
case ssa.IntegerCmpCondSignedLessThanOrEqual:
return le
case ssa.IntegerCmpCondUnsignedLessThan:
return lo
case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
return hs
case ssa.IntegerCmpCondUnsignedGreaterThan:
return hi
case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
return ls
default:
panic(c)
}
}
// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond.
func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag {
switch c {
case ssa.FloatCmpCondEqual:
return eq
case ssa.FloatCmpCondNotEqual:
return ne
case ssa.FloatCmpCondLessThan:
return mi
case ssa.FloatCmpCondLessThanOrEqual:
return ls
case ssa.FloatCmpCondGreaterThan:
return gt
case ssa.FloatCmpCondGreaterThanOrEqual:
return ge
default:
panic(c)
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,301 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
vr = m.compiler.AllocateVReg(valType)
v := instr.ConstantVal()
m.insertLoadConstant(v, valType, vr)
return
}
// InsertLoadConstantBlockArg implements backend.Machine.
func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
v := instr.ConstantVal()
load := m.allocateInstr()
load.asLoadConstBlockArg(v, valType, vr)
m.insert(load)
}
func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) {
v, typ, dst := i.loadConstBlockArgData()
m.insertLoadConstant(v, typ, dst)
}
func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) {
if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
v = v & ((1 << valType.Bits()) - 1)
}
switch valType {
case ssa.TypeF32:
loadF := m.allocateInstr()
loadF.asLoadFpuConst32(vr, v)
m.insert(loadF)
case ssa.TypeF64:
loadF := m.allocateInstr()
loadF.asLoadFpuConst64(vr, v)
m.insert(loadF)
case ssa.TypeI32:
if v == 0 {
m.InsertMove(vr, xzrVReg, ssa.TypeI32)
} else {
m.lowerConstantI32(vr, int32(v))
}
case ssa.TypeI64:
if v == 0 {
m.InsertMove(vr, xzrVReg, ssa.TypeI64)
} else {
m.lowerConstantI64(vr, int64(v))
}
default:
panic("TODO")
}
}
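// Illustrative sketch (not part of this change): insertLoadConstant masks away the bits above
// valType.Bits() before lowering, e.g. a sign-extended 32-bit constant loses its upper half.
// The helper name below is made up purely to show that masking step.
func exampleClearRedundantBits(v uint64, bits byte) uint64 {
if bits < 64 {
v &= (1 << bits) - 1 // e.g. v=0xffff_ffff_8000_0000, bits=32 -> 0x8000_0000.
}
return v
}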
// The following logic is based on the old asm/arm64 package.
// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go
func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) {
// Following the logic here:
// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637
ic := int64(uint32(c))
if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) {
if isBitMaskImmediate(uint64(c), false) {
m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false)
return
}
}
if t := const16bitAligned(int64(uint32(c))); t >= 0 {
// If the const fits in a single 16-bit aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
// we can load it with a single MOVZ.
m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false)
} else if t := const16bitAligned(int64(^c)); t >= 0 {
// Also, if the inverse of the const fits in a single 16-bit aligned chunk, we can load it with a single MOVN.
m.insertMOVN(dst, uint64(^c>>(16*t)), t, false)
} else if isBitMaskImmediate(uint64(uint32(c)), false) {
m.lowerConstViaBitMaskImmediate(uint64(c), dst, false)
} else {
// Otherwise, we use MOVZ and MOVK to load it.
c16 := uint16(c)
m.insertMOVZ(dst, uint64(c16), 0, false)
c16 = uint16(uint32(c) >> 16)
m.insertMOVK(dst, uint64(c16), 1, false)
}
}
func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) {
// Following the logic here:
// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852
if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
if isBitMaskImmediate(uint64(c), true) {
m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
return
}
}
if t := const16bitAligned(c); t >= 0 {
// If the const fits in a single 16-bit aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
// we can load it with a single MOVZ.
m.insertMOVZ(dst, uint64(c)>>(16*t), t, true)
} else if t := const16bitAligned(^c); t >= 0 {
// Also, if the inverse of the const fits in a single 16-bit aligned chunk, we can load it with a single MOVN.
m.insertMOVN(dst, uint64(^c)>>(16*t), t, true)
} else if isBitMaskImmediate(uint64(c), true) {
m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
} else {
m.load64bitConst(c, dst)
}
}
func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) {
instr := m.allocateInstr()
instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64)
m.insert(instr)
}
// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate".
//
// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits.
// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits.
//
// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate-
func isBitMaskImmediate(x uint64, _64 bool) bool {
// All zeros and ones are not "bitmask immediate" by definition.
if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) {
return false
}
switch {
case x != x>>32|x<<32:
// e = 64
case x != x>>16|x<<48:
// e = 32 (x == x>>32|x<<32).
// e.g. 0x00ff_ff00_00ff_ff00
x = uint64(int32(x))
case x != x>>8|x<<56:
// e = 16 (x == x>>16|x<<48).
// e.g. 0x00ff_00ff_00ff_00ff
x = uint64(int16(x))
case x != x>>4|x<<60:
// e = 8 (x == x>>8|x<<56).
// e.g. 0x0f0f_0f0f_0f0f_0f0f
x = uint64(int8(x))
default:
// e = 4 or 2.
return true
}
return sequenceOfSetbits(x) || sequenceOfSetbits(^x)
}
// sequenceOfSetbits returns true if the number's binary representation is a single contiguous sequence of set bits (1s).
// For example: 0b1110 -> true, 0b1010 -> false
func sequenceOfSetbits(x uint64) bool {
y := getLowestBit(x)
// If x is a sequence of set bits, this should result in a number
// with only one set bit (i.e. a power of two).
y += x
return (y-1)&y == 0
}
func getLowestBit(x uint64) uint64 {
return x & (^x + 1)
}
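// Illustrative sketch (not part of this change): concrete values for the helpers above,
// matching the examples given in their comments.
func exampleBitMaskImmediateValues() {
_ = sequenceOfSetbits(0b1110) // true: one contiguous run of ones.
_ = sequenceOfSetbits(0b1010) // false: the set bits are not contiguous.
_ = isBitMaskImmediate(0, true) // false: all zeros is excluded by definition.
_ = isBitMaskImmediate(0xffff_ffff_ffff_ffff, true) // false: all ones is excluded by definition.
_ = isBitMaskImmediate(0xff, true) // true: a single run of eight ones.
}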
// const16bitAligned checks whether the value fits entirely within a single 16-bit aligned chunk.
// If so, it returns the chunk index (the shift amount divided by 16); otherwise, it returns -1.
func const16bitAligned(v int64) (ret int) {
ret = -1
for s := 0; s < 64; s += 16 {
if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 {
ret = s / 16
break
}
}
return
}
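// Illustrative sketch (not part of this change): const16bitAligned returns the index of the
// 16-bit chunk that fully contains the value, or -1 when the value spans more than one chunk.
func exampleConst16bitAligned() {
_ = const16bitAligned(0xffff) // 0: fits in bits [0, 16).
_ = const16bitAligned(0xffff_0000) // 1: fits in bits [16, 32).
_ = const16bitAligned(0x1234_0000_0000_0000) // 3: fits in bits [48, 64).
_ = const16bitAligned(0x1_0001) // -1: spans two 16-bit chunks.
}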
// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit
// consts as in the Go assembler.
//
// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759
func (m *machine) load64bitConst(c int64, dst regalloc.VReg) {
var bits [4]uint64
var zeros, negs int
for i := 0; i < 4; i++ {
bits[i] = uint64(c) >> uint(i*16) & 0xffff
if v := bits[i]; v == 0 {
zeros++
} else if v == 0xffff {
negs++
}
}
if zeros == 3 {
// one MOVZ instruction.
for i, v := range bits {
if v != 0 {
m.insertMOVZ(dst, v, i, true)
}
}
} else if negs == 3 {
// one MOVN instruction.
for i, v := range bits {
if v != 0xffff {
v = ^v
m.insertMOVN(dst, v, i, true)
}
}
} else if zeros == 2 {
// one MOVZ then one MOVK.
var movz bool
for i, v := range bits {
if !movz && v != 0 { // MOVZ.
m.insertMOVZ(dst, v, i, true)
movz = true
} else if v != 0 {
m.insertMOVK(dst, v, i, true)
}
}
} else if negs == 2 {
// one MOVN then one MOVK.
var movn bool
for i, v := range bits { // Emit MOVN.
if !movn && v != 0xffff {
v = ^v
// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
m.insertMOVN(dst, v, i, true)
movn = true
} else if v != 0xffff {
m.insertMOVK(dst, v, i, true)
}
}
} else if zeros == 1 {
// one MOVZ then two MOVK.
var movz bool
for i, v := range bits {
if !movz && v != 0 { // MOVZ.
m.insertMOVZ(dst, v, i, true)
movz = true
} else if v != 0 {
m.insertMOVK(dst, v, i, true)
}
}
} else if negs == 1 {
// one MOVN then two MOVK.
var movn bool
for i, v := range bits { // Emit MOVN.
if !movn && v != 0xffff {
v = ^v
// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
m.insertMOVN(dst, v, i, true)
movn = true
} else if v != 0xffff {
m.insertMOVK(dst, v, i, true)
}
}
} else {
// one MOVZ then up to three MOVK.
var movz bool
for i, v := range bits {
if !movz && v != 0 { // MOVZ.
m.insertMOVZ(dst, v, i, true)
movz = true
} else if v != 0 {
m.insertMOVK(dst, v, i, true)
}
}
}
}
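// Illustrative sketch (not part of this change): load64bitConst chooses between MOVZ/MOVN/MOVK
// by counting how many 16-bit halfwords are all-zero or all-one. For example,
// 0x0000_0000_0012_0000 has halfwords [0x0000, 0x0012, 0x0000, 0x0000], so zeros == 3 and a
// single MOVZ with shift index 1 suffices. The hypothetical helper below mirrors that counting step.
func exampleCountHalfwords(c int64) (zeros, negs int) {
for i := 0; i < 4; i++ {
switch uint64(c) >> uint(i*16) & 0xffff {
case 0:
zeros++
case 0xffff:
negs++
}
}
return
}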
func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
instr.asMOVZ(dst, v, uint64(shift), dst64)
m.insert(instr)
}
func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
instr.asMOVK(dst, v, uint64(shift), dst64)
m.insert(instr)
}
func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
instr.asMOVN(dst, v, uint64(shift), dst64)
m.insert(instr)
}

File diff suppressed because it is too large

View File

@ -0,0 +1,350 @@
package arm64
// This file contains the logic to "find and determine operands" for instructions.
// In order to finalize the form of an operand, we might end up merging/eliminating
// the source instructions into an operand whenever possible.
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type (
// operand represents an operand of an instruction whose type is determined by the kind.
operand struct {
kind operandKind
data, data2 uint64
}
operandKind byte
)
// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts,
// but also for the names of the functions which return an operand of that kind.
const (
// operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others.
operandKindNR operandKind = iota
// operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant.
// Some of the arm64 instructions can take this kind of operand.
operandKindSR
// operandKindER represents "Extended Register" (ER). This is a register which is sign/zero-extended to a larger size.
// Some of the arm64 instructions can take this kind of operand.
operandKindER
// operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not.
// See asImm12 function for detail.
operandKindImm12
// operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations.
operandKindShiftImm
)
// format returns the string representation of this operand for debugging.
func (o operand) format(size byte) string {
switch o.kind {
case operandKindNR:
return formatVRegSized(o.nr(), size)
case operandKindSR:
r, amt, sop := o.sr()
return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt)
case operandKindER:
r, eop, _ := o.er()
return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop)
case operandKindImm12:
imm12, shiftBit := o.imm12()
if shiftBit == 1 {
return fmt.Sprintf("#%#x", uint64(imm12)<<12)
} else {
return fmt.Sprintf("#%#x", imm12)
}
default:
panic(fmt.Sprintf("unknown operand kind: %d", o.kind))
}
}
// operandNR encodes the given VReg as an operand of operandKindNR.
func operandNR(r regalloc.VReg) operand {
return operand{kind: operandKindNR, data: uint64(r)}
}
// nr decodes the underlying VReg assuming the operand is of operandKindNR.
func (o operand) nr() regalloc.VReg {
return regalloc.VReg(o.data)
}
// operandER encodes the given VReg as an operand of operandKindER.
func operandER(r regalloc.VReg, eop extendOp, to byte) operand {
if to < 32 {
panic("TODO?BUG?: when we need to extend to less than 32 bits?")
}
return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)}
}
// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER.
func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) {
return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff)
}
// operandSR encodes the given VReg as an operand of operandKindSR.
func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand {
return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)}
}
// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR.
func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) {
return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff
}
// operandImm12 encodes the given imm12 as an operand of operandKindImm12.
func operandImm12(imm12 uint16, shiftBit byte) operand {
return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32}
}
// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12.
func (o operand) imm12() (v uint16, shiftBit byte) {
return uint16(o.data), byte(o.data >> 32)
}
// operandShiftImm encodes the given amount as an operand of operandKindShiftImm.
func operandShiftImm(amount byte) operand {
return operand{kind: operandKindShiftImm, data: uint64(amount)}
}
// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm.
func (o operand) shiftImm() byte {
return byte(o.data)
}
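// Illustrative sketch (not part of this change): operands are bit-packed into the data/data2
// fields, so each constructor above pairs with a decoder that recovers the original values.
// The hypothetical function below only demonstrates two of those round-trips.
func exampleOperandRoundTrip(r regalloc.VReg) {
if v, shift := operandImm12(0xabc, 1).imm12(); v != 0xabc || shift != 1 {
panic("BUG in example: imm12 operand must round-trip")
}
if rr, amt, sop := operandSR(r, 3, shiftOpLSL).sr(); rr != r || amt != 3 || sop != shiftOpLSL {
panic("BUG in example: shifted-register operand must round-trip")
}
}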
// reg returns the register of the operand if applicable.
func (o operand) reg() regalloc.VReg {
switch o.kind {
case operandKindNR:
return o.nr()
case operandKindSR:
r, _, _ := o.sr()
return r
case operandKindER:
r, _, _ := o.er()
return r
case operandKindImm12:
// Does not have a register.
case operandKindShiftImm:
// Does not have a register.
default:
panic(o.kind)
}
return regalloc.VRegInvalid
}
func (o operand) realReg() regalloc.RealReg {
return o.nr().RealReg()
}
func (o operand) assignReg(v regalloc.VReg) operand {
switch o.kind {
case operandKindNR:
return operandNR(v)
case operandKindSR:
_, amt, sop := o.sr()
return operandSR(v, amt, sop)
case operandKindER:
_, eop, to := o.er()
return operandER(v, eop, to)
case operandKindImm12:
// Does not have a register.
case operandKindShiftImm:
// Does not have a register.
}
panic(o.kind)
}
// getOperand_Imm12_ER_SR_NR returns an operand of either operandKindImm12, operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
// If the operand can be expressed as operandKindImm12, `mode` is ignored.
func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
instr := def.Instr
if instr.Opcode() == ssa.OpcodeIconst {
if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok {
instr.MarkLowered()
return imm12Op
}
}
return m.getOperand_ER_SR_NR(def, mode)
}
// getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value.
// If the immediate value is negated, the second return value is true, otherwise always false.
func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg), false
}
instr := def.Instr
if instr.Opcode() == ssa.OpcodeIconst {
c := instr.ConstantVal()
if imm12Op, ok := asImm12Operand(c); ok {
instr.MarkLowered()
return imm12Op, false
}
signExtended := int64(c)
if def.SSAValue().Type().Bits() == 32 {
signExtended = (signExtended << 32) >> 32
}
negatedWithoutSign := -signExtended
if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok {
instr.MarkLowered()
return imm12Op, true
}
}
return m.getOperand_ER_SR_NR(def, mode), false
}
// getOperand_ER_SR_NR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) {
extInstr := def.Instr
signed := extInstr.Opcode() == ssa.OpcodeSExtend
innerExtFromBits, innerExtToBits := extInstr.ExtendFromToBits()
modeBits, modeSigned := mode.bits(), mode.signed()
if mode == extModeNone || innerExtToBits == modeBits {
eop := extendOpFrom(signed, innerExtFromBits)
extArg := m.getOperand_NR(m.compiler.ValueDefinition(extInstr.Arg()), extModeNone)
op = operandER(extArg.nr(), eop, innerExtToBits)
extInstr.MarkLowered()
return
}
if innerExtToBits > modeBits {
panic("BUG?TODO?: need the results of inner extension to be larger than the mode")
}
switch {
case (!signed && !modeSigned) || (signed && modeSigned):
// Two sign/zero extensions are equivalent to one sign/zero extension for the larger size.
eop := extendOpFrom(modeSigned, innerExtFromBits)
op = operandER(m.compiler.VRegOf(extInstr.Arg()), eop, modeBits)
extInstr.MarkLowered()
case (signed && !modeSigned) || (!signed && modeSigned):
// We need to {sign, zero}-extend the result of the {zero,sign} extension.
eop := extendOpFrom(modeSigned, innerExtToBits)
op = operandER(m.compiler.VRegOf(extInstr.Return()), eop, modeBits)
// Note that we failed to merge the inner extension instruction this case.
}
return
}
return m.getOperand_SR_NR(def, mode)
}
// getOperand_SR_NR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
if m.compiler.MatchInstr(def, ssa.OpcodeIshl) {
// Check if the shift amount is constant instruction.
targetVal, amountVal := def.Instr.Arg2()
targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr()
amountDef := m.compiler.ValueDefinition(amountVal)
if amountDef.IsFromInstr() && amountDef.Instr.Constant() {
// If that is the case, we can use the shifted register operand (SR).
c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits.
def.Instr.MarkLowered()
amountDef.Instr.MarkLowered()
return operandSR(targetVReg, c, shiftOpLSL)
}
}
return m.getOperand_NR(def, mode)
}
// getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def`).
func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
instr := def.Instr
if instr.Constant() {
amount := byte(instr.ConstantVal()) & (shiftBitWidth - 1) // Clears the unnecessary bits.
return operandShiftImm(amount)
}
return m.getOperand_NR(def, mode)
}
// getOperand_NR returns an operand of operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
var v regalloc.VReg
if def.IsFromBlockParam() {
v = def.BlkParamVReg
} else {
instr := def.Instr
if instr.Constant() {
// We inline all the constant instructions so that we could reduce the register usage.
v = m.lowerConstant(instr)
instr.MarkLowered()
} else {
if n := def.N; n == 0 {
v = m.compiler.VRegOf(instr.Return())
} else {
_, rs := instr.Returns()
v = m.compiler.VRegOf(rs[n-1])
}
}
}
r := v
switch inBits := def.SSAValue().Type().Bits(); {
case mode == extModeNone:
case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32):
case inBits == 32 && mode == extModeZeroExtend64:
extended := m.compiler.AllocateVReg(ssa.TypeI64)
ext := m.allocateInstr()
ext.asExtend(extended, v, 32, 64, false)
m.insert(ext)
r = extended
case inBits == 32 && mode == extModeSignExtend64:
extended := m.compiler.AllocateVReg(ssa.TypeI64)
ext := m.allocateInstr()
ext.asExtend(extended, v, 32, 64, true)
m.insert(ext)
r = extended
case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64):
}
return operandNR(r)
}
func asImm12Operand(val uint64) (op operand, ok bool) {
v, shiftBit, ok := asImm12(val)
if !ok {
return operand{}, false
}
return operandImm12(v, shiftBit), true
}
func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) {
const mask1, mask2 uint64 = 0xfff, 0xfff_000
if val&^mask1 == 0 {
return uint16(val), 0, true
} else if val&^mask2 == 0 {
return uint16(val >> 12), 1, true
} else {
return 0, 0, false
}
}
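// Illustrative sketch (not part of this change): asImm12 accepts values that fit in the low
// 12 bits, or in the low 12 bits shifted left by 12; anything else is rejected.
func exampleAsImm12() {
_, _, _ = asImm12(0xfff) // (0xfff, shiftBit=0, ok=true)
_, _, _ = asImm12(0x1000) // (0x1, shiftBit=1, ok=true): encoded as 0x1 << 12.
_, _, _ = asImm12(0x1001) // (0, 0, ok=false): needs both the low and the shifted field.
}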

View File

@ -0,0 +1,440 @@
package arm64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type (
// addressMode represents an ARM64 addressing mode.
//
// https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing
// TODO: use the bit-packed layout like operand struct.
addressMode struct {
kind addressModeKind
rn, rm regalloc.VReg
extOp extendOp
imm int64
}
// addressModeKind represents the kind of ARM64 addressing mode.
addressModeKind byte
)
const (
// addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended,
// and then scaled by bits(type)/8.
//
// e.g.
// - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1)
// - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1)
// - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2)
// - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3)
//
// See the following pages:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--
addressModeKindRegScaledExtended addressModeKind = iota
// addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without the extension.
addressModeKindRegScaled
// addressModeKindRegExtended is the same as addressModeKindRegScaledExtended, but without the scale factor.
addressModeKindRegExtended
// addressModeKindRegReg takes a base register and an index register. The index register is neither scaled nor extended.
addressModeKindRegReg
// addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255).
// The immediate will be sign-extended, and be added to the base register.
// This is a.k.a. "unscaled" since the immediate is not scaled.
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--
addressModeKindRegSignedImm9
// addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset, scaled by
// the size of the type. In other words, the actual offset will be imm12 * bits(type)/8.
// See "Unsigned offset" in the following pages:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
addressModeKindRegUnsignedImm12
// addressModeKindPostIndex takes a base register and a 9-bit "signed" immediate offset.
// After the load/store, the base register will be updated by the offset.
//
// Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
//
// See "Post-index" in the following pages for examples:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
addressModeKindPostIndex
// addressModeKindPreIndex takes a base register and a 9-bit "signed" immediate offset.
// Before the load/store, the base register will be updated by the offset.
//
// Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
//
// See "Pre-index" in the following pages for examples:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
addressModeKindPreIndex
// addressModeKindArgStackSpace is used to resolve the address of the argument stack space
// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
// at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above.
addressModeKindArgStackSpace
// addressModeKindResultStackSpace is used to resolve the address of the result stack space
// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
// at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above.
addressModeKindResultStackSpace
)
func (a addressMode) format(dstSizeBits byte) (ret string) {
base := formatVRegSized(a.rn, 64)
if rn := a.rn; rn.RegType() != regalloc.RegTypeInt {
panic("invalid base register type: " + a.rn.RegType().String())
} else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 {
panic("BUG: likely a bug in reg alloc or reset behavior")
}
switch a.kind {
case addressModeKindRegScaledExtended:
amount := a.sizeInBitsToShiftAmount(dstSizeBits)
ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount)
case addressModeKindRegScaled:
amount := a.sizeInBitsToShiftAmount(dstSizeBits)
ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount)
case addressModeKindRegExtended:
ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp)
case addressModeKindRegReg:
ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()))
case addressModeKindRegSignedImm9:
if a.imm != 0 {
ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
} else {
ret = fmt.Sprintf("[%s]", base)
}
case addressModeKindRegUnsignedImm12:
if a.imm != 0 {
ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
} else {
ret = fmt.Sprintf("[%s]", base)
}
case addressModeKindPostIndex:
ret = fmt.Sprintf("[%s], #%#x", base, a.imm)
case addressModeKindPreIndex:
ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm)
case addressModeKindArgStackSpace:
ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm)
case addressModeKindResultStackSpace:
ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm)
}
return
}
func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
}
if preIndex {
return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
} else {
return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
}
}
func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
divisor := int64(dstSizeInBits) / 8
return 0 < offset && offset%divisor == 0 && offset/divisor < 4096
}
func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool {
return -256 <= offset && offset <= 255
}
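// Illustrative sketch (not part of this change): the unsigned-imm12 form requires a positive
// offset that is a multiple of the access size and whose scaled value fits in 12 bits, while
// the signed-imm9 form only requires -256 <= offset <= 255.
func exampleOffsetFits() {
_ = offsetFitsInAddressModeKindRegUnsignedImm12(64, 8) // true: 8/8 = 1 < 4096.
_ = offsetFitsInAddressModeKindRegUnsignedImm12(64, 12) // false: 12 is not a multiple of 8.
_ = offsetFitsInAddressModeKindRegUnsignedImm12(32, 16384) // false: 16384/4 = 4096 is out of range.
_ = offsetFitsInAddressModeKindRegSignedImm9(-256) // true: lowest representable offset.
_ = offsetFitsInAddressModeKindRegSignedImm9(256) // false: just above the representable range.
}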
func (a addressMode) indexRegBits() byte {
bits := a.extOp.srcBits()
if bits != 32 && bits != 64 {
panic("invalid index register for address mode. it must be either 32 or 64 bits")
}
return bits
}
func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) {
switch sizeInBits {
case 8:
lsl = 0
case 16:
lsl = 1
case 32:
lsl = 2
case 64:
lsl = 3
}
return
}
func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) {
switch op {
case ssa.OpcodeUload8:
size, signed = 8, false
case ssa.OpcodeUload16:
size, signed = 16, false
case ssa.OpcodeUload32:
size, signed = 32, false
case ssa.OpcodeSload8:
size, signed = 8, true
case ssa.OpcodeSload16:
size, signed = 16, true
case ssa.OpcodeSload32:
size, signed = 32, true
default:
panic("BUG")
}
return
}
func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) {
size, signed := extLoadSignSize(op)
amode := m.lowerToAddressMode(ptr, offset, size)
load := m.allocateInstr()
if signed {
load.asSLoad(operandNR(ret), amode, size)
} else {
load.asULoad(operandNR(ret), amode, size)
}
m.insert(load)
}
func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) {
amode := m.lowerToAddressMode(ptr, offset, typ.Bits())
dst := m.compiler.VRegOf(ret)
load := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
load.asULoad(operandNR(dst), amode, typ.Bits())
case ssa.TypeF32, ssa.TypeF64:
load.asFpuLoad(operandNR(dst), amode, typ.Bits())
case ssa.TypeV128:
load.asFpuLoad(operandNR(dst), amode, 128)
default:
panic("TODO")
}
m.insert(load)
}
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) {
// vecLoad1R supports an offset (base+imm) only in the post-index addressing mode, so we simply add the offset to the base here.
base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr()
offsetReg := m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(offsetReg, int64(offset))
addedBase := m.addReg64ToReg64(base, offsetReg)
rd := operandNR(m.compiler.VRegOf(ret))
ld1r := m.allocateInstr()
ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
m.insert(ld1r)
}
func (m *machine) lowerStore(si *ssa.Instruction) {
// TODO: merge consecutive stores into a single pair store instruction.
value, ptr, offset, storeSizeInBits := si.StoreData()
amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits)
valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone)
store := m.allocateInstr()
store.asStore(valueOp, amode, storeSizeInBits)
m.insert(store)
}
// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
// to support more efficient address resolution.
a32s, a64s, offset := m.collectAddends(ptr)
offset += int64(offsetBase)
return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset)
}
// lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends.
// During the construction, this might emit additional instructions.
//
// Extracted as a separate function for easy testing.
func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); {
case a64sExist && a32sExist:
base := a64s.Dequeue()
a32 := a32s.Dequeue()
amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
base := a64s.Dequeue()
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
offset = 0
case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
base := a64s.Dequeue()
amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
offset = 0
case a64sExist:
base := a64s.Dequeue()
if !a64s.Empty() {
index := a64s.Dequeue()
amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
} else {
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
}
case a32sExist:
base32 := a32s.Dequeue()
// First we need 64-bit base.
base := m.compiler.AllocateVReg(ssa.TypeI64)
baseExt := m.allocateInstr()
var signed bool
if base32.ext == extendOpSXTW {
signed = true
}
baseExt.asExtend(base, base32.r, 32, 64, signed)
m.insert(baseExt)
if !a32s.Empty() {
index := a32s.Dequeue()
amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
} else {
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
}
default: // Only static offsets.
tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(tmpReg, offset)
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
offset = 0
}
baseReg := amode.rn
if offset > 0 {
baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
}
for !a64s.Empty() {
a64 := a64s.Dequeue()
baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
}
for !a32s.Empty() {
a32 := a32s.Dequeue()
baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
}
amode.rn = baseReg
return
}
var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}
func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) {
m.addendsWorkQueue.Reset()
m.addends32.Reset()
m.addends64.Reset()
m.addendsWorkQueue.Enqueue(ptr)
for !m.addendsWorkQueue.Empty() {
v := m.addendsWorkQueue.Dequeue()
def := m.compiler.ValueDefinition(v)
switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
case ssa.OpcodeIadd:
// If the addend is an add, we recursively collect its operands.
x, y := def.Instr.Arg2()
m.addendsWorkQueue.Enqueue(x)
m.addendsWorkQueue.Enqueue(y)
def.Instr.MarkLowered()
case ssa.OpcodeIconst:
// If the addend is constant, we just statically merge it into the offset.
ic := def.Instr
u64 := ic.ConstantVal()
if ic.Return().Type().Bits() == 32 {
offset += int64(int32(u64)) // sign-extend.
} else {
offset += int64(u64)
}
def.Instr.MarkLowered()
case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
input := def.Instr.Arg()
if input.Type().Bits() != 32 {
panic("illegal size: " + input.Type().String())
}
var ext extendOp
if op == ssa.OpcodeUExtend {
ext = extendOpUXTW
} else {
ext = extendOpSXTW
}
inputDef := m.compiler.ValueDefinition(input)
constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
switch {
case constInst && ext == extendOpUXTW:
// Zero-extension of a 32-bit constant can be merged into the offset.
offset += int64(uint32(inputDef.Instr.ConstantVal()))
case constInst && ext == extendOpSXTW:
// Sign-extension of a 32-bit constant can be merged into the offset.
offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
default:
m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
}
def.Instr.MarkLowered()
continue
default:
// If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it.
m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
}
}
return &m.addends32, &m.addends64, offset
}
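// Illustrative sketch (not part of this change): constant addends are folded into the static
// offset, and 32-bit constants are sign- or zero-extended depending on how they reach the
// address computation, mirroring the iconst/extend cases above. The helper name is made up.
func exampleFoldConstAddend(offset int64, constVal uint64, is32bit, signed bool) int64 {
switch {
case is32bit && signed:
return offset + int64(int32(constVal)) // e.g. 0xffff_ffff folds as -1.
case is32bit:
return offset + int64(uint32(constVal)) // e.g. 0xffff_ffff folds as 4294967295.
default:
return offset + int64(constVal)
}
}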
func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
if imm12Op, ok := asImm12Operand(uint64(c)); ok {
alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
} else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
} else {
tmp := m.compiler.AllocateVReg(ssa.TypeI64)
m.load64bitConst(c, tmp)
alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
}
m.insert(alu)
return
}
func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
m.insert(alu)
return
}
func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
m.insert(alu)
return
}

View File

@ -0,0 +1,515 @@
package arm64
import (
"context"
"fmt"
"strings"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type (
// machine implements backend.Machine.
machine struct {
compiler backend.Compiler
executableContext *backend.ExecutableContextT[instruction]
currentABI *backend.FunctionABI
regAlloc regalloc.Allocator
regAllocFn *backend.RegAllocFunction[*instruction, *machine]
// addendsWorkQueue is used during address lowering, defined here for reuse.
addendsWorkQueue wazevoapi.Queue[ssa.Value]
addends32 wazevoapi.Queue[addend32]
// addends64 is used during address lowering, defined here for reuse.
addends64 wazevoapi.Queue[regalloc.VReg]
unresolvedAddressModes []*instruction
// condBrRelocs holds the conditional branches which need offset relocation.
condBrRelocs []condBrReloc
// jmpTableTargets holds the labels of the jump table targets.
jmpTableTargets [][]uint32
// spillSlotSize is the size of the stack slot in bytes used for spilling registers.
// During the execution of the function, the stack looks like:
//
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | xxxxx |
// | ReturnAddress |
// +-----------------+ <<-|
// | ........... | |
// | spill slot M | | <--- spillSlotSize
// | ............ | |
// | spill slot 2 | |
// | spill slot 1 | <<-+
// | clobbered N |
// | ........... |
// | clobbered 1 |
// | clobbered 0 |
// SP---> +-----------------+
// (low address)
//
// and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16.
// Also note that this is only known after register allocation.
spillSlotSize int64
spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset.
// clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue.
clobberedRegs []regalloc.VReg
maxRequiredStackSizeForCalls int64
stackBoundsCheckDisabled bool
regAllocStarted bool
}
addend32 struct {
r regalloc.VReg
ext extendOp
}
condBrReloc struct {
cbr *instruction
// currentLabelPos is the labelPosition within which condBr is defined.
currentLabelPos *labelPosition
// Next block's labelPosition.
nextLabel label
offset int64
}
labelPosition = backend.LabelPosition[instruction]
label = backend.Label
)
const (
labelReturn = backend.LabelReturn
labelInvalid = backend.LabelInvalid
)
// NewBackend returns a new backend for arm64.
func NewBackend() backend.Machine {
m := &machine{
spillSlots: make(map[regalloc.VRegID]int64),
executableContext: newExecutableContext(),
regAlloc: regalloc.NewAllocator(regInfo),
}
return m
}
func newExecutableContext() *backend.ExecutableContextT[instruction] {
return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
}
// ExecutableContext implements backend.Machine.
func (m *machine) ExecutableContext() backend.ExecutableContext {
return m.executableContext
}
// RegAlloc implements backend.Machine RegAlloc.
func (m *machine) RegAlloc() {
rf := m.regAllocFn
for _, pos := range m.executableContext.OrderedBlockLabels {
rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
}
m.regAllocStarted = true
m.regAlloc.DoAllocation(rf)
// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
}
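// Illustrative sketch (not part of this change): the expression above rounds spillSlotSize up
// to the next multiple of 16, e.g. 0 -> 0, 8 -> 16, 24 -> 32.
func exampleAlignTo16(size int64) int64 {
return (size + 15) &^ 15
}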
// Reset implements backend.Machine.
func (m *machine) Reset() {
// Reuse clobberedRegs as scratch space: collect the spillSlots keys into it, delete them from the map, then truncate it again.
m.clobberedRegs = m.clobberedRegs[:0]
for key := range m.spillSlots {
m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
}
for _, key := range m.clobberedRegs {
delete(m.spillSlots, regalloc.VRegID(key))
}
m.clobberedRegs = m.clobberedRegs[:0]
m.regAllocStarted = false
m.regAlloc.Reset()
m.regAllocFn.Reset()
m.spillSlotSize = 0
m.unresolvedAddressModes = m.unresolvedAddressModes[:0]
m.maxRequiredStackSizeForCalls = 0
m.executableContext.Reset()
m.jmpTableTargets = m.jmpTableTargets[:0]
}
// SetCurrentABI implements backend.Machine SetCurrentABI.
func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
m.currentABI = abi
}
// DisableStackCheck implements backend.Machine DisableStackCheck.
func (m *machine) DisableStackCheck() {
m.stackBoundsCheckDisabled = true
}
// SetCompiler implements backend.Machine.
func (m *machine) SetCompiler(ctx backend.Compiler) {
m.compiler = ctx
m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx)
}
func (m *machine) insert(i *instruction) {
ectx := m.executableContext
ectx.PendingInstructions = append(ectx.PendingInstructions, i)
}
func (m *machine) insertBrTargetLabel() label {
nop, l := m.allocateBrTarget()
m.insert(nop)
return l
}
func (m *machine) allocateBrTarget() (nop *instruction, l label) {
ectx := m.executableContext
l = ectx.AllocateLabel()
nop = m.allocateInstr()
nop.asNop0WithLabel(l)
pos := ectx.AllocateLabelPosition(l)
pos.Begin, pos.End = nop, nop
ectx.LabelPositions[l] = pos
return
}
// allocateInstr allocates an instruction.
func (m *machine) allocateInstr() *instruction {
instr := m.executableContext.InstructionPool.Allocate()
if !m.regAllocStarted {
instr.addedBeforeRegAlloc = true
}
return instr
}
func resetInstruction(i *instruction) {
*i = instruction{}
}
func (m *machine) allocateNop() *instruction {
instr := m.allocateInstr()
instr.asNop0()
return instr
}
func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
amode := &i.amode
switch amode.kind {
case addressModeKindResultStackSpace:
amode.imm += ret0offset
case addressModeKindArgStackSpace:
amode.imm += arg0offset
default:
panic("BUG")
}
var sizeInBits byte
switch i.kind {
case store8, uLoad8:
sizeInBits = 8
case store16, uLoad16:
sizeInBits = 16
case store32, fpuStore32, uLoad32, fpuLoad32:
sizeInBits = 32
case store64, fpuStore64, uLoad64, fpuLoad64:
sizeInBits = 64
case fpuStore128, fpuLoad128:
sizeInBits = 128
default:
panic("BUG")
}
if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) {
amode.kind = addressModeKindRegUnsignedImm12
} else {
// In this case, we load the offset into the temporary register
// and then use it as the index register.
newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm)
linkInstr(newPrev, i)
*amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */}
}
}
// resolveRelativeAddresses resolves the relative addresses before encoding.
func (m *machine) resolveRelativeAddresses(ctx context.Context) {
ectx := m.executableContext
for {
if len(m.unresolvedAddressModes) > 0 {
arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP()
for _, i := range m.unresolvedAddressModes {
m.resolveAddressingMode(arg0offset, ret0offset, i)
}
}
// Reuse the slice to gather the unresolved conditional branches.
m.condBrRelocs = m.condBrRelocs[:0]
var fn string
var fnIndex int
var labelToSSABlockID map[label]ssa.BasicBlockID
if wazevoapi.PerfMapEnabled {
fn = wazevoapi.GetCurrentFunctionName(ctx)
labelToSSABlockID = make(map[label]ssa.BasicBlockID)
for i, l := range ectx.SsaBlockIDToLabels {
labelToSSABlockID[l] = ssa.BasicBlockID(i)
}
fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
}
// Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label.
var offset int64
for i, pos := range ectx.OrderedBlockLabels {
pos.BinaryOffset = offset
var size int64
for cur := pos.Begin; ; cur = cur.next {
switch cur.kind {
case nop0:
l := cur.nop0Label()
if pos, ok := ectx.LabelPositions[l]; ok {
pos.BinaryOffset = offset + size
}
case condBr:
if !cur.condBrOffsetResolved() {
var nextLabel label
if i < len(ectx.OrderedBlockLabels)-1 {
// Note: this is only used when the block ends with fallthrough,
// so it can safely be assumed that the next block exists when it's needed.
nextLabel = ectx.OrderedBlockLabels[i+1].L
}
m.condBrRelocs = append(m.condBrRelocs, condBrReloc{
cbr: cur, currentLabelPos: pos, offset: offset + size,
nextLabel: nextLabel,
})
}
}
size += cur.size()
if cur == pos.End {
break
}
}
if wazevoapi.PerfMapEnabled {
if size > 0 {
l := pos.L
var labelStr string
if blkID, ok := labelToSSABlockID[l]; ok {
labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
} else {
labelStr = l.String()
}
wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
}
}
offset += size
}
// Before resolving any offsets, we need to check if all the conditional branches can be resolved.
var needRerun bool
for i := range m.condBrRelocs {
reloc := &m.condBrRelocs[i]
cbr := reloc.cbr
offset := reloc.offset
target := cbr.condBrLabel()
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
diff := offsetOfTarget - offset
if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
// In this case, the conditional branch target is too far away. We place trampoline instructions at the end of the current block,
// and make the conditional branch jump to them instead.
m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel)
// Then, we need to recall this function to fix up the label offsets
// as they have changed after the trampoline is inserted.
needRerun = true
}
}
if needRerun {
if wazevoapi.PerfMapEnabled {
wazevoapi.PerfMap.Clear()
}
} else {
break
}
}
var currentOffset int64
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
switch cur.kind {
case br:
target := cur.brLabel()
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
diff := offsetOfTarget - currentOffset
divided := diff >> 2
if divided < minSignedInt26 || divided > maxSignedInt26 {
// This means the currently compiled single function is extremely large.
panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range")
}
cur.brOffsetResolve(diff)
case condBr:
if !cur.condBrOffsetResolved() {
target := cur.condBrLabel()
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
diff := offsetOfTarget - currentOffset
if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly")
}
cur.condBrOffsetResolve(diff)
}
case brTableSequence:
tableIndex := cur.u1
targets := m.jmpTableTargets[tableIndex]
for i := range targets {
l := label(targets[i])
offsetOfTarget := ectx.LabelPositions[l].BinaryOffset
diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin)
targets[i] = uint32(diff)
}
cur.brTableSequenceOffsetsResolved()
case emitSourceOffsetInfo:
m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo())
}
currentOffset += cur.size()
}
}
const (
maxSignedInt26 = 1<<25 - 1
minSignedInt26 = -(1 << 25)
maxSignedInt19 = 1<<18 - 1
minSignedInt19 = -(1 << 18)
)
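// Illustrative sketch (not part of this change): both resolution loops above divide the byte
// offset by 4 (every instruction is 4 bytes) and check the result against the signed 19-bit
// (conditional branch) or 26-bit (unconditional branch) range. The hypothetical helpers below
// only restate those two checks.
func exampleFitsInSignedInt19(diffInBytes int64) bool {
d := diffInBytes >> 2
return minSignedInt19 <= d && d <= maxSignedInt19
}
func exampleFitsInSignedInt26(diffInBytes int64) bool {
d := diffInBytes >> 2
return minSignedInt26 <= d && d <= maxSignedInt26
}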
func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) {
cur := currentBlk.End
originalTarget := cbr.condBrLabel()
endNext := cur.next
if cur.kind != br {
// If the current block ends with an unconditional branch (br), we could just insert the trampoline after it.
// Otherwise, we need to insert a "skip" branch instruction to jump over the trampoline instructions.
skip := m.allocateInstr()
skip.asBr(nextLabel)
cur = linkInstr(cur, skip)
}
cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget()
cbr.setCondBrTargets(cbrNewTargetLabel)
cur = linkInstr(cur, cbrNewTargetInstr)
// Then insert the unconditional branch to the original target, which should be encodable,
// as the 26-bit offset range is enough for any practical application.
br := m.allocateInstr()
br.asBr(originalTarget)
cur = linkInstr(cur, br)
// Update the end of the current block.
currentBlk.End = cur
linkInstr(cur, endNext)
}
// Format implements backend.Machine.
func (m *machine) Format() string {
ectx := m.executableContext
begins := map[*instruction]label{}
for l, pos := range ectx.LabelPositions {
begins[pos.Begin] = l
}
irBlocks := map[label]ssa.BasicBlockID{}
for i, l := range ectx.SsaBlockIDToLabels {
irBlocks[l] = ssa.BasicBlockID(i)
}
var lines []string
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
if l, ok := begins[cur]; ok {
var labelStr string
if blkID, ok := irBlocks[l]; ok {
labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
} else {
labelStr = fmt.Sprintf("%s:", l)
}
lines = append(lines, labelStr)
}
if cur.kind == nop0 {
continue
}
lines = append(lines, "\t"+cur.String())
}
return "\n" + strings.Join(lines, "\n") + "\n"
}
// InsertReturn implements backend.Machine.
func (m *machine) InsertReturn() {
i := m.allocateInstr()
i.asRet()
m.insert(i)
}
func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
offset, ok := m.spillSlots[id]
if !ok {
offset = m.spillSlotSize
// TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible.
m.spillSlots[id] = offset
m.spillSlotSize += int64(size)
}
return offset + 16 // spill slot starts above the clobbered registers and the frame size.
}
func (m *machine) clobberedRegSlotSize() int64 {
return int64(len(m.clobberedRegs) * 16)
}
func (m *machine) arg0OffsetFromSP() int64 {
return m.frameSize() +
16 + // 16-byte aligned return address
16 // frame size saved below the clobbered registers.
}
func (m *machine) ret0OffsetFromSP() int64 {
return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize
}
func (m *machine) requiredStackSize() int64 {
return m.maxRequiredStackSizeForCalls +
m.frameSize() +
16 + // 16-byte aligned return address.
16 // frame size saved below the clobbered registers.
}
func (m *machine) frameSize() int64 {
s := m.clobberedRegSlotSize() + m.spillSlotSize
if s&0xf != 0 {
panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
}
return s
}
func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
// TODO: reuse the slice!
labels := make([]uint32, len(targets))
for j, target := range targets {
labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
}
index = len(m.jmpTableTargets)
m.jmpTableTargets = append(m.jmpTableTargets, labels)
return
}

View File

@ -0,0 +1,469 @@
package arm64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
// PostRegAlloc implements backend.Machine.
func (m *machine) PostRegAlloc() {
m.setupPrologue()
m.postRegAlloc()
}
// setupPrologue initializes the prologue of the function.
func (m *machine) setupPrologue() {
ectx := m.executableContext
cur := ectx.RootInstr
prevInitInst := cur.next
//
// (high address) (high address)
// SP----> +-----------------+ +------------------+ <----+
// | ....... | | ....... | |
// | ret Y | | ret Y | |
// | ....... | | ....... | |
// | ret 0 | | ret 0 | |
// | arg X | | arg X | | size_of_arg_ret.
// | ....... | ====> | ....... | |
// | arg 1 | | arg 1 | |
// | arg 0 | | arg 0 | <----+
// |-----------------| | size_of_arg_ret |
// | return address |
// +------------------+ <---- SP
// (low address) (low address)
// Saves the return address (lr) and the size_of_arg_ret below the SP.
// size_of_arg_ret is used for stack unwinding.
cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
if !m.stackBoundsCheckDisabled {
cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
}
// Decrement SP if spillSlotSize > 0.
if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
}
if regs := m.clobberedRegs; len(regs) > 0 {
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | size_of_arg_ret | | size_of_arg_ret |
// | ReturnAddress | | ReturnAddress |
// SP----> +-----------------+ ====> +-----------------+
// (low address) | clobbered M |
// | ............ |
// | clobbered 0 |
// +-----------------+ <----- SP
// (low address)
//
_amode := addressModePreOrPostIndex(spVReg,
-16, // stack pointer must be 16-byte aligned.
true, // Decrement before store.
)
for _, vr := range regs {
// TODO: pair stores to reduce the number of instructions.
store := m.allocateInstr()
store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
cur = linkInstr(cur, store)
}
}
if size := m.spillSlotSize; size > 0 {
// Check if size is 16-byte aligned.
if size&0xf != 0 {
panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
}
cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)
// At this point, the stack looks like:
//
// (high address)
// +------------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | size_of_arg_ret |
// | ReturnAddress |
// +------------------+
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 2 |
// | spill slot 0 |
// SP----> +------------------+
// (low address)
}
// We push the frame size onto the stack to make stack unwinding possible:
//
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | size_of_arg_ret | | size_of_arg_ret |
// | ReturnAddress | | ReturnAddress |
// +-----------------+ ==> +-----------------+ <----+
// | clobbered M | | clobbered M | |
// | ............ | | ............ | |
// | clobbered 2 | | clobbered 2 | |
// | clobbered 1 | | clobbered 1 | | frame size
// | clobbered 0 | | clobbered 0 | |
// | spill slot N | | spill slot N | |
// | ............ | | ............ | |
// | spill slot 0 | | spill slot 0 | <----+
// SP---> +-----------------+ | xxxxxx | ;; unused space to make it 16-byte aligned.
// | frame_size |
// +-----------------+ <---- SP
// (low address)
//
cur = m.createFrameSizeSlot(cur, m.frameSize())
linkInstr(cur, prevInitInst)
}
func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
// First we decrement the stack pointer so that it points to the arg0 slot.
var sizeOfArgRetReg regalloc.VReg
s := int64(m.currentABI.AlignedArgResultStackSlotSize())
if s > 0 {
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
sizeOfArgRetReg = tmpRegVReg
subSp := m.allocateInstr()
subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
cur = linkInstr(cur, subSp)
} else {
sizeOfArgRetReg = xzrVReg
}
// Saves the return address (lr) and the size_of_arg_ret below the SP.
// size_of_arg_ret is used for stack unwinding.
pstr := m.allocateInstr()
amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
cur = linkInstr(cur, pstr)
return cur
}
func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
var frameSizeReg regalloc.VReg
if s > 0 {
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
frameSizeReg = tmpRegVReg
} else {
frameSizeReg = xzrVReg
}
_amode := addressModePreOrPostIndex(spVReg,
-16, // stack pointer must be 16-byte aligned.
true, // Decrement before store.
)
store := m.allocateInstr()
store.asStore(operandNR(frameSizeReg), _amode, 64)
cur = linkInstr(cur, store)
return cur
}
// postRegAlloc does multiple things while walking through the instructions:
// 1. Removes the redundant copy instruction.
// 2. Inserts the epilogue.
func (m *machine) postRegAlloc() {
ectx := m.executableContext
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
switch cur.kind {
case ret:
m.setupEpilogueAfter(cur.prev)
case loadConstBlockArg:
lc := cur
next := lc.next
m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
m.lowerLoadConstantBlockArgAfterRegAlloc(lc)
for _, instr := range m.executableContext.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
default:
// Removes the redundant copy instruction.
if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
prev, next := cur.prev, cur.next
// Remove the copy instruction.
prev.next = next
if next != nil {
next.prev = prev
}
}
}
}
}
func (m *machine) setupEpilogueAfter(cur *instruction) {
prevNext := cur.next
// We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)
if s := m.spillSlotSize; s > 0 {
// Adjust SP to the original value:
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | | xxxxx |
// | ReturnAddress | | ReturnAddress |
// +-----------------+ ====> +-----------------+
// | clobbered M | | clobbered M |
// | ............ | | ............ |
// | clobbered 1 | | clobbered 1 |
// | clobbered 0 | | clobbered 0 |
// | spill slot N | +-----------------+ <---- SP
// | ............ |
// | spill slot 0 |
// SP---> +-----------------+
// (low address)
//
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
}
// First we need to restore the clobbered registers.
if len(m.clobberedRegs) > 0 {
// (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | | xxxxx |
// | ReturnAddress | | ReturnAddress |
// +-----------------+ ========> +-----------------+ <---- SP
// | clobbered M |
// | ........... |
// | clobbered 1 |
// | clobbered 0 |
// SP---> +-----------------+
// (low address)
l := len(m.clobberedRegs) - 1
for i := range m.clobberedRegs {
vr := m.clobberedRegs[l-i] // reverse order to restore.
load := m.allocateInstr()
amode := addressModePreOrPostIndex(spVReg,
16, // stack pointer must be 16-byte aligned.
false, // Increment after load.
)
// TODO: pair loads to reduce the number of instructions.
switch regTypeToRegisterSizeInBits(vr.RegType()) {
case 64: // restore int reg.
load.asULoad(operandNR(vr), amode, 64)
case 128: // restore vector reg.
load.asFpuLoad(operandNR(vr), amode, 128)
}
cur = linkInstr(cur, load)
}
}
// Reload the return address (lr).
//
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | ===> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | +-----------------+ <---- SP
// | ReturnAddress |
// SP----> +-----------------+
ldr := m.allocateInstr()
ldr.asULoad(operandNR(lrVReg),
addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
cur = linkInstr(cur, ldr)
if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
}
linkInstr(cur, prevNext)
}
// saveRequiredRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
// which always points to the execution context whenever the native code is entered from Go.
var saveRequiredRegs = []regalloc.VReg{
x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}
// insertStackBoundsCheck will insert the instructions after `cur` to check the
// stack bounds, and if there is not enough stack space left for the function,
// exit the execution and try growing the stack in the Go world.
//
// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
if requiredStackSize%16 != 0 {
panic("BUG")
}
if imm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
// sub tmp, sp, #requiredStackSize
sub := m.allocateInstr()
sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), imm12op, true)
cur = linkInstr(cur, sub)
} else {
// In this case, we first load the requiredStackSize into the temporary register,
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
// Then subtract it.
sub := m.allocateInstr()
sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
cur = linkInstr(cur, sub)
}
tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue.
// ldr tmp2, [executionContext #StackBottomPtr]
ldr := m.allocateInstr()
ldr.asULoad(operandNR(tmp2), addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: x0VReg, // execution context is always the first argument.
imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
}, 64)
cur = linkInstr(cur, ldr)
// subs xzr, tmp, tmp2
subs := m.allocateInstr()
subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
cur = linkInstr(cur, subs)
// b.ge #imm
cbr := m.allocateInstr()
cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
cur = linkInstr(cur, cbr)
// Set the required stack size and set it to the exec context.
{
// First load the requiredStackSize into the temporary register,
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
setRequiredStackSize := m.allocateInstr()
setRequiredStackSize.asStore(operandNR(tmpRegVReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
}, 64)
cur = linkInstr(cur, setRequiredStackSize)
}
ldrAddress := m.allocateInstr()
ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: x0VReg, // execution context is always the first argument
imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
}, 64)
cur = linkInstr(cur, ldrAddress)
// Then jump to the stack grow call sequence's address,
// transferring control to the code compiled by CompileStackGrowCallSequence.
bl := m.allocateInstr()
bl.asCallIndirect(tmpRegVReg, nil)
cur = linkInstr(cur, bl)
// Now that we know the entire code, we can finalize how many bytes
// we have to skip when the stack size is sufficient.
var cbrOffset int64
for _cur := cbr; ; _cur = _cur.next {
cbrOffset += _cur.size()
if _cur == cur {
break
}
}
cbr.condBrOffsetResolve(cbrOffset)
return cur
}
// CompileStackGrowCallSequence implements backend.Machine.
func (m *machine) CompileStackGrowCallSequence() []byte {
ectx := m.executableContext
cur := m.allocateInstr()
cur.asNop0()
ectx.RootInstr = cur
// Save the callee saved and argument registers.
cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)
// Save the current stack pointer.
cur = m.saveCurrentStackPointer(cur, x0VReg)
// Set the exit status on the execution context.
cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)
// Exit the execution.
cur = m.storeReturnAddressAndExit(cur)
// After the exit, restore the saved registers.
cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)
// Then go back to the original address of this stack grow call.
ret := m.allocateInstr()
ret.asRet()
linkInstr(cur, ret)
m.encode(ectx.RootInstr)
return m.compiler.Buf()
}
func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
ectx := m.executableContext
ectx.PendingInstructions = ectx.PendingInstructions[:0]
m.insertAddOrSubStackPointer(rd, diff, add)
for _, inserted := range ectx.PendingInstructions {
cur = linkInstr(cur, inserted)
}
return cur
}

View File

@ -0,0 +1,152 @@
package arm64
// This file implements the interfaces required for register allocation. See backend.RegAllocFunctionMachine.
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// ClobberedRegisters implements backend.RegAllocFunctionMachine.
func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
}
// Swap implements backend.RegAllocFunctionMachine.
func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
prevNext := cur.next
var mov1, mov2, mov3 *instruction
if x1.RegType() == regalloc.RegTypeInt {
if !tmp.Valid() {
tmp = tmpRegVReg
}
mov1 = m.allocateInstr().asMove64(tmp, x1)
mov2 = m.allocateInstr().asMove64(x1, x2)
mov3 = m.allocateInstr().asMove64(x2, tmp)
cur = linkInstr(cur, mov1)
cur = linkInstr(cur, mov2)
cur = linkInstr(cur, mov3)
linkInstr(cur, prevNext)
} else {
if !tmp.Valid() {
r2 := x2.RealReg()
// Temporarily spill x1 to stack.
cur = m.InsertStoreRegisterAt(x1, cur, true).prev
// Then move x2 to x1.
cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2))
linkInstr(cur, prevNext)
// Then reload the original value of x1 from the stack into r2.
m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
} else {
mov1 = m.allocateInstr().asFpuMov128(tmp, x1)
mov2 = m.allocateInstr().asFpuMov128(x1, x2)
mov3 = m.allocateInstr().asFpuMov128(x2, tmp)
cur = linkInstr(cur, mov1)
cur = linkInstr(cur, mov2)
cur = linkInstr(cur, mov3)
linkInstr(cur, prevNext)
}
}
}
// InsertMoveBefore implements backend.RegAllocFunctionMachine.
func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
typ := src.RegType()
if typ != dst.RegType() {
panic("BUG: src and dst must have the same type")
}
mov := m.allocateInstr()
if typ == regalloc.RegTypeInt {
mov.asMove64(dst, src)
} else {
mov.asFpuMov128(dst, src)
}
cur := instr.prev
prevNext := cur.next
cur = linkInstr(cur, mov)
linkInstr(cur, prevNext)
}
// SSABlockLabel implements backend.RegAllocFunctionMachine.
func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
return m.executableContext.SsaBlockIDToLabels[id]
}
// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.compiler.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
var amode addressMode
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
store := m.allocateInstr()
store.asStore(operandNR(v), amode, typ.Bits())
cur = linkInstr(cur, store)
return linkInstr(cur, prevNext)
}
// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.compiler.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
var amode addressMode
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
load := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
load.asULoad(operandNR(v), amode, typ.Bits())
case ssa.TypeF32, ssa.TypeF64:
load.asFpuLoad(operandNR(v), amode, typ.Bits())
case ssa.TypeV128:
load.asFpuLoad(operandNR(v), amode, 128)
default:
panic("TODO")
}
cur = linkInstr(cur, load)
return linkInstr(cur, prevNext)
}
// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
cur := end
for cur.kind == nop0 {
cur = cur.prev
if cur == begin {
return end
}
}
switch cur.kind {
case br:
return cur
default:
return end
}
}

View File

@ -0,0 +1,117 @@
package arm64
import (
"encoding/binary"
"fmt"
"math"
"sort"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
)
const (
// trampolineCallSize is the size of the trampoline instruction sequence for each function in an island.
trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate.
// Unconditional branch offset is encoded as divided by 4 in imm26.
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en
maxUnconditionalBranchOffset = maxSignedInt26 * 4
minUnconditionalBranchOffset = minSignedInt26 * 4
// trampolineIslandInterval is the range of the trampoline island.
// Half of the range is used for the trampoline island, and the other half is used for the function.
trampolineIslandInterval = maxUnconditionalBranchOffset / 2
// maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable.
maxNumFunctions = trampolineIslandInterval >> 6
// maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island.
// Conservatively set to 1/4 of the trampoline island interval.
maxFunctionExecutableSize = trampolineIslandInterval >> 2
)
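// trampolineEntryOffset is an illustrative sketch, not part of the original source: it shows how a
// function's trampoline entry is located inside an island, mirroring the islandOffset + trampolineCallSize*i
// layout used by ResolveRelocations and encodeCallTrampolineIsland below. The function name is hypothetical.
func trampolineEntryOffset(islandOffset, funcRef int) int {
	// Each entry occupies trampolineCallSize bytes (4 instructions + a 32-bit immediate = 20 bytes).
	return islandOffset + trampolineCallSize*funcRef
}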
// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo.
func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) {
if numFunctions > maxNumFunctions {
return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions)
}
return trampolineIslandInterval, trampolineCallSize * numFunctions, nil
}
// ResolveRelocations implements backend.Machine ResolveRelocations.
func (m *machine) ResolveRelocations(
refToBinaryOffset []int,
executable []byte,
relocations []backend.RelocationInfo,
callTrampolineIslandOffsets []int,
) {
for _, islandOffset := range callTrampolineIslandOffsets {
encodeCallTrampolineIsland(refToBinaryOffset, islandOffset, executable)
}
for _, r := range relocations {
instrOffset := r.Offset
calleeFnOffset := refToBinaryOffset[r.FuncRef]
diff := int64(calleeFnOffset) - (instrOffset)
// Check if the diff is within the range of the branch instruction.
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
// Find the near trampoline island from callTrampolineIslandOffsets.
islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset))
islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef)
diff = int64(islandTargetOffset) - (instrOffset)
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
panic("BUG in trampoline placement")
}
}
binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff))
}
}
// encodeCallTrampolineIsland encodes a trampoline island for the given functions.
// Each island consists of a trampoline instruction sequence for each function.
// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate.
func encodeCallTrampolineIsland(refToBinaryOffset []int, islandOffset int, executable []byte) {
for i := 0; i < len(refToBinaryOffset); i++ {
trampolineOffset := islandOffset + trampolineCallSize*i
fnOffset := refToBinaryOffset[i]
diff := fnOffset - (trampolineOffset + 16)
if diff > math.MaxInt32 || diff < math.MinInt32 {
// Even amd64 can't handle this case: 4GB is too big.
panic("too big binary")
}
// tmpReg and tmpReg2 are safe to overwrite (in fact, any caller-saved register is safe to use).
tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11]
// adr tmpReg, PC+16: load the address of #diff into tmpReg.
binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16))
// ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2.
binary.LittleEndian.PutUint32(executable[trampolineOffset+4:],
encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg}))
// add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function.
binary.LittleEndian.PutUint32(executable[trampolineOffset+8:],
encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false))
// br tmpReg: branch to the function without overwriting the link register.
binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false))
// #diff
binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff))
}
}
// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets.
// Note that even if the offset is in the middle of two islands, it returns the latter one.
// That is ok because the island is always placed in the middle of the range.
//
// precondition: callTrampolineIslandOffsets is sorted in ascending order.
func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int {
l := len(callTrampolineIslandOffsets)
n := sort.Search(l, func(i int) bool {
return callTrampolineIslandOffsets[i] >= offset
})
if n == l {
n = l - 1
}
return callTrampolineIslandOffsets[n]
}
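// The following is an illustrative sketch, not part of the original source: with islands at offsets
// 0x1000 and 0x3000, an instruction at 0x1800 resolves to the island at 0x3000, because sort.Search
// picks the first island at or after the offset (and the last island is the fallback when none follows).
// The function name is hypothetical.
func searchTrampolineIslandExample() int {
	return searchTrampolineIsland([]int{0x1000, 0x3000}, 0x1800) // == 0x3000
}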

View File

@ -0,0 +1,397 @@
package arm64
import (
"fmt"
"strconv"
"strings"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
)
// Arm64-specific registers.
//
// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state
const (
// General purpose registers. Note that we do not distinguish wn and xn registers
// because they are the same from the perspective of the register allocator, and
// the size can be determined by the type of the instruction.
x0 = regalloc.RealRegInvalid + 1 + iota
x1
x2
x3
x4
x5
x6
x7
x8
x9
x10
x11
x12
x13
x14
x15
x16
x17
x18
x19
x20
x21
x22
x23
x24
x25
x26
x27
x28
x29
x30
// Vector registers. Note that we do not distinguish vn and dn, ... registers
// because they are the same from the perspective of register allocator, and
// the size can be determined by the type of the instruction.
v0
v1
v2
v3
v4
v5
v6
v7
v8
v9
v10
v11
v12
v13
v14
v15
v16
v17
v18
v19
v20
v21
v22
v23
v24
v25
v26
v27
v28
v29
v30
v31
// Special registers
xzr
sp
lr = x30
fp = x29
tmp = x27
)
var (
x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt)
x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt)
x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt)
x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt)
x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt)
x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt)
x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt)
x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt)
x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt)
x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt)
x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt)
x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt)
x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt)
x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt)
x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt)
x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt)
x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt)
x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt)
x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt)
x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt)
x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt)
x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt)
x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt)
x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt)
x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt)
x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt)
x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt)
x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt)
x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt)
x29VReg = regalloc.FromRealReg(x29, regalloc.RegTypeInt)
x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt)
v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat)
v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat)
v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat)
v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat)
v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat)
v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat)
v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat)
v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat)
v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat)
v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat)
v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat)
v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat)
v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat)
v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat)
v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat)
v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat)
v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat)
v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat)
v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat)
v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat)
v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat)
v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat)
v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat)
v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat)
v24VReg = regalloc.FromRealReg(v24, regalloc.RegTypeFloat)
v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat)
v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat)
v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat)
// lr (link register) holds the return address at the function entry.
lrVReg = x30VReg
// tmpReg is used to perform spill/load on large stack offsets, and load large constants.
// Therefore, be cautious to use this register in the middle of the compilation, especially before the register allocation.
// This is the same as golang/go, but it's only described in the source code:
// https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59
// https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15
tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt)
v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat)
v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat)
v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat)
v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat)
xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt)
spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt)
fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt)
)
var regNames = [...]string{
x0: "x0",
x1: "x1",
x2: "x2",
x3: "x3",
x4: "x4",
x5: "x5",
x6: "x6",
x7: "x7",
x8: "x8",
x9: "x9",
x10: "x10",
x11: "x11",
x12: "x12",
x13: "x13",
x14: "x14",
x15: "x15",
x16: "x16",
x17: "x17",
x18: "x18",
x19: "x19",
x20: "x20",
x21: "x21",
x22: "x22",
x23: "x23",
x24: "x24",
x25: "x25",
x26: "x26",
x27: "x27",
x28: "x28",
x29: "x29",
x30: "x30",
xzr: "xzr",
sp: "sp",
v0: "v0",
v1: "v1",
v2: "v2",
v3: "v3",
v4: "v4",
v5: "v5",
v6: "v6",
v7: "v7",
v8: "v8",
v9: "v9",
v10: "v10",
v11: "v11",
v12: "v12",
v13: "v13",
v14: "v14",
v15: "v15",
v16: "v16",
v17: "v17",
v18: "v18",
v19: "v19",
v20: "v20",
v21: "v21",
v22: "v22",
v23: "v23",
v24: "v24",
v25: "v25",
v26: "v26",
v27: "v27",
v28: "v28",
v29: "v29",
v30: "v30",
v31: "v31",
}
func formatVRegSized(r regalloc.VReg, size byte) (ret string) {
if r.IsRealReg() {
ret = regNames[r.RealReg()]
switch ret[0] {
case 'x':
switch size {
case 32:
ret = strings.Replace(ret, "x", "w", 1)
case 64:
default:
panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
}
case 'v':
switch size {
case 32:
ret = strings.Replace(ret, "v", "s", 1)
case 64:
ret = strings.Replace(ret, "v", "d", 1)
case 128:
ret = strings.Replace(ret, "v", "q", 1)
default:
panic("BUG: invalid register size")
}
}
} else {
switch r.RegType() {
case regalloc.RegTypeInt:
switch size {
case 32:
ret = fmt.Sprintf("w%d?", r.ID())
case 64:
ret = fmt.Sprintf("x%d?", r.ID())
default:
panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
}
case regalloc.RegTypeFloat:
switch size {
case 32:
ret = fmt.Sprintf("s%d?", r.ID())
case 64:
ret = fmt.Sprintf("d%d?", r.ID())
case 128:
ret = fmt.Sprintf("q%d?", r.ID())
default:
panic("BUG: invalid register size")
}
default:
panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r))
}
}
return
}
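// formatVRegSizedExamples is an illustrative sketch, not part of the original source: it demonstrates
// the naming scheme implemented above, i.e. w/x prefixes for 32/64-bit views of the integer registers
// and s/d/q prefixes for 32/64/128-bit views of the vector registers. The function name is hypothetical.
func formatVRegSizedExamples() []string {
	return []string{
		formatVRegSized(x0VReg, 32),  // "w0"
		formatVRegSized(x0VReg, 64),  // "x0"
		formatVRegSized(v1VReg, 32),  // "s1"
		formatVRegSized(v1VReg, 128), // "q1"
	}
}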
func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) {
var id string
wspec := strings.ToLower(width.String())
if r.IsRealReg() {
id = regNames[r.RealReg()][1:]
} else {
id = fmt.Sprintf("%d?", r.ID())
}
ret = fmt.Sprintf("%s%s", wspec, id)
return
}
func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) {
id := fmt.Sprintf("v%d?", r.ID())
if r.IsRealReg() {
id = regNames[r.RealReg()]
}
ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String()))
if index != vecIndexNone {
ret += fmt.Sprintf("[%d]", index)
}
return
}
func regTypeToRegisterSizeInBits(r regalloc.RegType) byte {
switch r {
case regalloc.RegTypeInt:
return 64
case regalloc.RegTypeFloat:
return 128
default:
panic("BUG: invalid register type")
}
}
var regNumberInEncoding = [...]uint32{
x0: 0,
x1: 1,
x2: 2,
x3: 3,
x4: 4,
x5: 5,
x6: 6,
x7: 7,
x8: 8,
x9: 9,
x10: 10,
x11: 11,
x12: 12,
x13: 13,
x14: 14,
x15: 15,
x16: 16,
x17: 17,
x18: 18,
x19: 19,
x20: 20,
x21: 21,
x22: 22,
x23: 23,
x24: 24,
x25: 25,
x26: 26,
x27: 27,
x28: 28,
x29: 29,
x30: 30,
xzr: 31,
sp: 31,
v0: 0,
v1: 1,
v2: 2,
v3: 3,
v4: 4,
v5: 5,
v6: 6,
v7: 7,
v8: 8,
v9: 9,
v10: 10,
v11: 11,
v12: 12,
v13: 13,
v14: 14,
v15: 15,
v16: 16,
v17: 17,
v18: 18,
v19: 19,
v20: 20,
v21: 21,
v22: 22,
v23: 23,
v24: 24,
v25: 25,
v26: 26,
v27: 27,
v28: 28,
v29: 29,
v30: 30,
v31: 31,
}

View File

@ -0,0 +1,90 @@
package arm64
import (
"encoding/binary"
"reflect"
"unsafe"
"github.com/tetratelabs/wazero/internal/wasmdebug"
)
// UnwindStack implements wazevo.unwindStack.
func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr {
l := int(top - sp)
var stackBuf []byte
{
// TODO: use unsafe.Slice after floor version is set to Go 1.20.
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
hdr.Data = sp
hdr.Len = l
hdr.Cap = l
}
for i := uint64(0); i < uint64(l); {
// (high address)
// +-----------------+
// | ....... |
// | ret Y | <----+
// | ....... | |
// | ret 0 | |
// | arg X | | size_of_arg_ret
// | ....... | |
// | arg 1 | |
// | arg 0 | <----+
// | size_of_arg_ret |
// | ReturnAddress |
// +-----------------+ <----+
// | ........... | |
// | spill slot M | |
// | ............ | |
// | spill slot 2 | |
// | spill slot 1 | | frame size
// | spill slot 0 | |
// | clobbered N | |
// | ............ | |
// | clobbered 0 | <----+
// | xxxxxx | ;; unused space to make it 16-byte aligned.
// | frame_size |
// +-----------------+ <---- SP
// (low address)
frameSize := binary.LittleEndian.Uint64(stackBuf[i:])
i += frameSize +
16 // frame size + aligned space.
retAddr := binary.LittleEndian.Uint64(stackBuf[i:])
i += 8 // ret addr.
sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:])
i += 8 + sizeOfArgRet
returnAddresses = append(returnAddresses, uintptr(retAddr))
if len(returnAddresses) == wasmdebug.MaxFrames {
break
}
}
return returnAddresses
}
// GoCallStackView implements wazevo.goCallStackView.
func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
// (high address)
// +-----------------+ <----+
// | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
// ^ | arg[N]/ret[M] | |
// sliceSize | | ............ | | sliceSize
// | | arg[1]/ret[1] | |
// v | arg[0]/ret[0] | <----+
// | sliceSize |
// | frame_size |
// +-----------------+ <---- stackPointerBeforeGoCall
// (low address)
ptr := unsafe.Pointer(stackPointerBeforeGoCall)
size := *(*uint64)(unsafe.Add(ptr, 8))
var view []uint64
{
sh := (*reflect.SliceHeader)(unsafe.Pointer(&view))
sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize).
sh.Len = int(size)
sh.Cap = int(size)
}
return view
}

View File

@ -0,0 +1,100 @@
package backend
import (
"context"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type (
// Machine is a backend for a specific ISA machine.
Machine interface {
ExecutableContext() ExecutableContext
// DisableStackCheck disables the stack check for the current compilation for debugging/testing.
DisableStackCheck()
// SetCurrentABI initializes the FunctionABI for the given signature.
SetCurrentABI(abi *FunctionABI)
// SetCompiler sets the compilation context used for the lifetime of Machine.
// This is only called once per Machine, i.e. before the first compilation.
SetCompiler(Compiler)
// LowerSingleBranch is called when the compilation of the given single branch is started.
LowerSingleBranch(b *ssa.Instruction)
// LowerConditionalBranch is called when the compilation of the given conditional branch is started.
LowerConditionalBranch(b *ssa.Instruction)
// LowerInstr is called for each instruction in the given block except for the ones marked as already lowered
// via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one.
//
// Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible
// for optimization.
LowerInstr(*ssa.Instruction)
// Reset resets the machine state for the next compilation.
Reset()
// InsertMove inserts a move instruction from src to dst whose type is typ.
InsertMove(dst, src regalloc.VReg, typ ssa.Type)
// InsertReturn inserts the return instruction to return from the current function.
InsertReturn()
// InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg.
InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg)
// Format returns the string representation of the currently compiled machine code.
// This is only for testing purpose.
Format() string
// RegAlloc does the register allocation after lowering.
RegAlloc()
// PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc.
PostRegAlloc()
// ResolveRelocations resolves the relocations after emitting machine code.
// * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset.
// * executable: the binary to resolve the relocations.
// * relocations: the relocations to resolve.
// * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable.
ResolveRelocations(
refToBinaryOffset []int,
executable []byte,
relocations []RelocationInfo,
callTrampolineIslandOffsets []int,
)
// Encode encodes the machine instructions to the Compiler.
Encode(ctx context.Context) error
// CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature.
CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte
// CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to
// call the stack grow builtin function.
CompileStackGrowCallSequence() []byte
// CompileEntryPreamble returns the sequence of instructions shared by multiple functions to
// enter the function from Go.
CompileEntryPreamble(signature *ssa.Signature) []byte
// LowerParams lowers the given parameters.
LowerParams(params []ssa.Value)
// LowerReturns lowers the given returns.
LowerReturns(returns []ssa.Value)
// ArgsResultsRegs returns the registers used for arguments and return values.
ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg)
// CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and
// the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine.
CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error)
}
)

View File

@ -0,0 +1,319 @@
package backend
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction.
type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface {
// InsertMoveBefore inserts the move instruction from src to dst before the given instruction.
InsertMoveBefore(dst, src regalloc.VReg, instr I)
// InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction.
// If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I
// InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction.
// If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I
// ClobberedRegisters is called when the register allocation is done and the clobbered registers are known.
ClobberedRegisters(regs []regalloc.VReg)
// Swap swaps the two virtual registers after the given instruction.
Swap(cur I, x1, x2, tmp regalloc.VReg)
// LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. See its comment for details.
LastInstrForInsertion(begin, end I) I
// SSABlockLabel returns the label of the given ssa.BasicBlockID.
SSABlockLabel(id ssa.BasicBlockID) Label
}
type (
// RegAllocFunction implements regalloc.Function.
RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
m m
ssb ssa.Builder
c Compiler
// iter is the iterator for reversePostOrderBlocks
iter int
reversePostOrderBlocks []RegAllocBlock[I, m]
// labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks.
labelToRegAllocBlockIndex map[Label]int
loopNestingForestRoots []ssa.BasicBlock
}
// RegAllocBlock implements regalloc.Block.
RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
// f is the function this block belongs to. Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses().
f *RegAllocFunction[I, m]
sb ssa.BasicBlock
l Label
begin, end I
loopNestingForestChildren []ssa.BasicBlock
cur I
id int
cachedLastInstrForInsertion I
}
)
// NewRegAllocFunction returns a new RegAllocFunction.
func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] {
return &RegAllocFunction[I, M]{
m: m,
ssb: ssb,
c: c,
labelToRegAllocBlockIndex: make(map[Label]int),
}
}
// AddBlock adds a new block to the function.
func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) {
i := len(f.reversePostOrderBlocks)
f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{
f: f,
sb: sb,
l: l,
begin: begin,
end: end,
id: int(sb.ID()),
})
f.labelToRegAllocBlockIndex[l] = i
}
// Reset resets the function for the next compilation.
func (f *RegAllocFunction[I, M]) Reset() {
f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0]
f.iter = 0
}
// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter.
func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertStoreRegisterAt(v, instr.(I), true)
}
// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore.
func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertReloadRegisterAt(v, instr.(I), false)
}
// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter.
func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertReloadRegisterAt(v, instr.(I), true)
}
// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore.
func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertStoreRegisterAt(v, instr.(I), false)
}
// ClobberedRegisters implements regalloc.Function ClobberedRegisters.
func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) {
f.m.ClobberedRegisters(regs)
}
// SwapBefore implements regalloc.Function SwapBefore.
func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) {
f.m.Swap(instr.Prev().(I), x1, x2, tmp)
}
// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin.
func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block {
f.iter = len(f.reversePostOrderBlocks) - 1
return f.PostOrderBlockIteratorNext()
}
// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext.
func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block {
if f.iter < 0 {
return nil
}
b := &f.reversePostOrderBlocks[f.iter]
f.iter--
return b
}
// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin.
func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block {
f.iter = 0
return f.ReversePostOrderBlockIteratorNext()
}
// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext.
func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block {
if f.iter >= len(f.reversePostOrderBlocks) {
return nil
}
b := &f.reversePostOrderBlocks[f.iter]
f.iter++
return b
}
// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots.
func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int {
f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots()
return len(f.loopNestingForestRoots)
}
// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot.
func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block {
blk := f.loopNestingForestRoots[i]
l := f.m.SSABlockLabel(blk.ID())
index := f.labelToRegAllocBlockIndex[l]
return &f.reversePostOrderBlocks[index]
}
// InsertMoveBefore implements regalloc.Function InsertMoveBefore.
func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) {
f.m.InsertMoveBefore(dst, src, instr.(I))
}
// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor.
func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block {
ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb)
l := f.m.SSABlockLabel(ret.ID())
index := f.labelToRegAllocBlockIndex[l]
return &f.reversePostOrderBlocks[index]
}
// Idom implements regalloc.Function Idom.
func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block {
builder := f.ssb
idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb)
if idom == nil {
panic("BUG: idom must not be nil")
}
l := f.m.SSABlockLabel(idom.ID())
index := f.labelToRegAllocBlockIndex[l]
return &f.reversePostOrderBlocks[index]
}
// ID implements regalloc.Block.
func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) }
// BlockParams implements regalloc.Block.
func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg {
c := r.f.c
*regs = (*regs)[:0]
for i := 0; i < r.sb.Params(); i++ {
v := c.VRegOf(r.sb.Param(i))
*regs = append(*regs, v)
}
return *regs
}
// InstrIteratorBegin implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr {
r.cur = r.begin
return r.cur
}
// InstrIteratorNext implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr {
for {
if r.cur == r.end {
return nil
}
instr := r.cur.Next()
r.cur = instr.(I)
if instr == nil {
return nil
} else if instr.AddedBeforeRegAlloc() {
// Only concerned about the instruction added before regalloc.
return instr
}
}
}
// InstrRevIteratorBegin implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr {
r.cur = r.end
return r.cur
}
// InstrRevIteratorNext implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr {
for {
if r.cur == r.begin {
return nil
}
instr := r.cur.Prev()
r.cur = instr.(I)
if instr == nil {
return nil
} else if instr.AddedBeforeRegAlloc() {
// Only concerned about the instruction added before regalloc.
return instr
}
}
}
// FirstInstr implements regalloc.Block.
func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr {
return r.begin
}
// EndInstr implements regalloc.Block.
func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr {
return r.end
}
// LastInstrForInsertion implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr {
var nil I
if r.cachedLastInstrForInsertion == nil {
r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end)
}
return r.cachedLastInstrForInsertion
}
// Preds implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() }
// Pred implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block {
sb := r.sb
pred := sb.Pred(i)
l := r.f.m.SSABlockLabel(pred.ID())
index := r.f.labelToRegAllocBlockIndex[l]
return &r.f.reversePostOrderBlocks[index]
}
// Entry implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() }
// Succs implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Succs() int {
return r.sb.Succs()
}
// Succ implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block {
sb := r.sb
succ := sb.Succ(i)
if succ.ReturnBlock() {
return nil
}
l := r.f.m.SSABlockLabel(succ.ID())
index := r.f.labelToRegAllocBlockIndex[l]
return &r.f.reversePostOrderBlocks[index]
}
// LoopHeader implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopHeader() bool {
return r.sb.LoopHeader()
}
// LoopNestingForestChildren implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int {
r.loopNestingForestChildren = r.sb.LoopNestingForestChildren()
return len(r.loopNestingForestChildren)
}
// LoopNestingForestChild implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block {
blk := r.loopNestingForestChildren[i]
l := r.f.m.SSABlockLabel(blk.ID())
index := r.f.labelToRegAllocBlockIndex[l]
return &r.f.reversePostOrderBlocks[index]
}

View File

@ -0,0 +1,136 @@
package regalloc
import "fmt"
// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register
// allocators to work on any ISA.
//
// TODO: the interfaces are not stabilized yet, especially x64 will need some changes. E.g. x64 has an addressing mode
// where index can be in memory. That kind of info will be useful to reduce the register pressure, and should be leveraged
// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html
type (
// Function is the top-level interface to do register allocation, which corresponds to a CFG containing
// Blocks(s).
Function interface {
// PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG.
// In other words, the last blocks in the CFG will be returned first.
PostOrderBlockIteratorBegin() Block
// PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG.
PostOrderBlockIteratorNext() Block
// ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG.
// In other words, the first blocks in the CFG will be returned first.
ReversePostOrderBlockIteratorBegin() Block
// ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG.
ReversePostOrderBlockIteratorNext() Block
// ClobberedRegisters is called by the register allocator to report the registers clobbered by this function.
ClobberedRegisters([]VReg)
// LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function.
LoopNestingForestRoots() int
// LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function.
LoopNestingForestRoot(i int) Block
// LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree.
LowestCommonAncestor(blk1, blk2 Block) Block
// Idom returns the immediate dominator of the given block.
Idom(blk Block) Block
// The following methods are for rewriting the function.
// SwapBefore swaps the values of the two virtual registers right before the given instruction, using tmp as a scratch register if needed.
SwapBefore(x1, x2, tmp VReg, instr Instr)
// StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register.
StoreRegisterBefore(v VReg, instr Instr)
// StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register.
StoreRegisterAfter(v VReg, instr Instr)
// ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register.
ReloadRegisterBefore(v VReg, instr Instr)
// ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register.
ReloadRegisterAfter(v VReg, instr Instr)
// InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers.
InsertMoveBefore(dst, src VReg, instr Instr)
}
// Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s).
Block interface {
// ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG.
ID() int32
// BlockParams returns the virtual registers used as the parameters of this block.
BlockParams(*[]VReg) []VReg
// InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped.
// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
InstrIteratorBegin() Instr
// InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped.
// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
InstrIteratorNext() Instr
// InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order.
InstrRevIteratorBegin() Instr
// InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order.
InstrRevIteratorNext() Instr
// FirstInstr returns the first instruction in this block; instructions will be inserted after it.
FirstInstr() Instr
// EndInstr returns the end instruction in this block.
EndInstr() Instr
// LastInstrForInsertion returns the last instruction in this block where instructions will be inserted before it.
// Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges.
// At the time of register allocation, all the critical edges are already split, so there is no need
// to worry about the case where branching instruction has multiple successors.
// Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns
// the unconditional branch, not the nop. In other words it is either nop or unconditional branch.
LastInstrForInsertion() Instr
// Preds returns the number of predecessors of this block in the CFG.
Preds() int
// Pred returns the i-th predecessor of this block in the CFG.
Pred(i int) Block
// Entry returns true if the block is for the entry block.
Entry() bool
// Succs returns the number of successors of this block in the CFG.
Succs() int
// Succ returns the i-th successor of this block in the CFG.
Succ(i int) Block
// LoopHeader returns true if this block is a loop header.
LoopHeader() bool
// LoopNestingForestChildren returns the number of children of this block in the loop nesting forest.
LoopNestingForestChildren() int
// LoopNestingForestChild returns the i-th child of this block in the loop nesting forest.
LoopNestingForestChild(i int) Block
}
// Instr is an instruction in a block, abstracting away the underlying ISA.
Instr interface {
fmt.Stringer
// Next returns the next instruction in the same block.
Next() Instr
// Prev returns the previous instruction in the same block.
Prev() Instr
// Defs returns the virtual registers defined by this instruction.
Defs(*[]VReg) []VReg
// Uses returns the virtual registers used by this instruction.
// Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this.
Uses(*[]VReg) []VReg
// AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index.
AssignUse(index int, v VReg)
// AssignDef assigns a RealReg-allocated virtual register defined by this instruction.
// This accepts only one register because we don't allocate registers for multi-def instructions (i.e. call instructions).
AssignDef(VReg)
// IsCopy returns true if this instruction is a move instruction between two registers.
// If true, the instruction is of the form dst = src, and if src and dst do not interfere with each other,
// we can coalesce them, and hence the copy can be eliminated from the final code.
IsCopy() bool
// IsCall returns true if this instruction is a call instruction. The result is used to insert
// caller-saved register spills and restores.
IsCall() bool
// IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer.
// The result is used to insert caller-saved register spills and restores.
IsIndirectCall() bool
// IsReturn returns true if this instruction is a return instruction.
IsReturn() bool
// AddedBeforeRegAlloc returns true if this instruction was added before register allocation.
AddedBeforeRegAlloc() bool
}
// InstrConstraint is an interface for arch-specific instruction constraints.
InstrConstraint interface {
comparable
Instr
}
)
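
The iterator-style API above is easiest to see in use. Below is a minimal sketch, not part of this diff: collectVRegs is a hypothetical helper that walks a single Block and gathers every virtual register that its pre-regalloc instructions use or define, assuming the iterators return a nil Instr once the block is exhausted.

func collectVRegs(blk Block) []VReg {
	var scratch, all []VReg
	for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() {
		if !instr.AddedBeforeRegAlloc() {
			continue // Instructions added after lowering must be skipped, per the doc comments above.
		}
		// Copy uses and defs out immediately: the returned slices may be reused across calls.
		all = append(all, instr.Uses(&scratch)...)
		all = append(all, instr.Defs(&scratch)...)
	}
	return all
}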

View File

@ -0,0 +1,123 @@
package regalloc
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// VReg represents a register assigned to an SSA value; it is how the backend refers to a register.
// A VReg may or may not be backed by a physical register, and the physical register info, if any, can be obtained via RealReg.
type VReg uint64
// VRegID is the lower 32 bits of a VReg, which is the pure identifier of the VReg without the RealReg info.
type VRegID uint32
// RealReg returns the RealReg of this VReg.
func (v VReg) RealReg() RealReg {
return RealReg(v >> 32)
}
// IsRealReg returns true if this VReg is backed by a physical register.
func (v VReg) IsRealReg() bool {
return v.RealReg() != RealRegInvalid
}
// FromRealReg returns a VReg from the given RealReg and RegType.
// This is used to represent a specific pre-colored register in the backend.
func FromRealReg(r RealReg, typ RegType) VReg {
rid := VRegID(r)
if rid > vRegIDReservedForRealNum {
panic(fmt.Sprintf("invalid real reg %d", r))
}
return VReg(r).SetRealReg(r).SetRegType(typ)
}
// SetRealReg sets the RealReg of this VReg and returns the updated VReg.
func (v VReg) SetRealReg(r RealReg) VReg {
return VReg(r)<<32 | (v & 0xff_00_ffffffff)
}
// RegType returns the RegType of this VReg.
func (v VReg) RegType() RegType {
return RegType(v >> 40)
}
// SetRegType sets the RegType of this VReg and returns the updated VReg.
func (v VReg) SetRegType(t RegType) VReg {
return VReg(t)<<40 | (v & 0x00_ff_ffffffff)
}
// ID returns the VRegID of this VReg.
func (v VReg) ID() VRegID {
return VRegID(v & 0xffffffff)
}
// Valid returns true if this VReg is valid.
func (v VReg) Valid() bool {
return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid
}
// RealReg represents a physical register.
type RealReg byte
const RealRegInvalid RealReg = 0
const (
vRegIDInvalid VRegID = 1 << 31
VRegIDNonReservedBegin = vRegIDReservedForRealNum
vRegIDReservedForRealNum VRegID = 128
VRegInvalid = VReg(vRegIDInvalid)
)
// String implements fmt.Stringer.
func (r RealReg) String() string {
switch r {
case RealRegInvalid:
return "invalid"
default:
return fmt.Sprintf("r%d", r)
}
}
// String implements fmt.Stringer.
func (v VReg) String() string {
if v.IsRealReg() {
return fmt.Sprintf("r%d", v.ID())
}
return fmt.Sprintf("v%d?", v.ID())
}
// RegType represents the type of a register.
type RegType byte
const (
RegTypeInvalid RegType = iota
RegTypeInt
RegTypeFloat
NumRegType
)
// String implements fmt.Stringer.
func (r RegType) String() string {
switch r {
case RegTypeInt:
return "int"
case RegTypeFloat:
return "float"
default:
return "invalid"
}
}
// RegTypeOf returns the RegType of the given ssa.Type.
func RegTypeOf(p ssa.Type) RegType {
switch p {
case ssa.TypeI32, ssa.TypeI64:
return RegTypeInt
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
return RegTypeFloat
default:
panic("invalid type")
}
}
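
As a rough illustration of the bit layout encoded above (the VRegID in the low 32 bits, the RealReg in bits 32-39, and the RegType in bits 40-47), the following sketch exercises the accessors; it is not part of the diff, exampleVRegLayout is a made-up name, and the concrete register numbers are arbitrary.

func exampleVRegLayout() {
	v := VReg(10).SetRegType(RegTypeOf(ssa.TypeI64)) // VRegID 10, typed as int, not yet backed by a RealReg.
	fmt.Println(v.ID(), v.RegType(), v.IsRealReg())  // 10 int false

	pre := FromRealReg(3, RegTypeFloat) // Pre-colored register: both the ID and the RealReg are 3.
	fmt.Println(pre.ID(), pre.RealReg(), pre.RegType()) // 3 r3 float
}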

File diff suppressed because it is too large

View File

@ -0,0 +1,108 @@
package regalloc
import (
"fmt"
"strings"
)
// NewRegSet returns a new RegSet with the given registers.
func NewRegSet(regs ...RealReg) RegSet {
var ret RegSet
for _, r := range regs {
ret = ret.add(r)
}
return ret
}
// RegSet represents a set of registers.
type RegSet uint64
func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused
var ret []string
for i := 0; i < 64; i++ {
if rs&(1<<uint(i)) != 0 {
ret = append(ret, info.RealRegName(RealReg(i)))
}
}
return strings.Join(ret, ", ")
}
func (rs RegSet) has(r RealReg) bool {
return rs&(1<<uint(r)) != 0
}
func (rs RegSet) add(r RealReg) RegSet {
if r >= 64 {
return rs
}
return rs | 1<<uint(r)
}
func (rs RegSet) Range(f func(allocatedRealReg RealReg)) {
for i := 0; i < 64; i++ {
if rs&(1<<uint(i)) != 0 {
f(RealReg(i))
}
}
}
type regInUseSet struct {
set RegSet
vrs [64]VReg
}
func (rs *regInUseSet) reset() {
rs.set = 0
for i := range rs.vrs {
rs.vrs[i] = VRegInvalid
}
}
func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
var ret []string
for i := 0; i < 64; i++ {
if rs.set&(1<<uint(i)) != 0 {
vr := rs.vrs[i]
ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID()))
}
}
return strings.Join(ret, ", ")
}
func (rs *regInUseSet) has(r RealReg) bool {
if r >= 64 {
return false
}
return rs.set&(1<<uint(r)) != 0
}
func (rs *regInUseSet) get(r RealReg) VReg {
if r >= 64 {
return VRegInvalid
}
return rs.vrs[r]
}
func (rs *regInUseSet) remove(r RealReg) {
if r >= 64 {
return
}
rs.set &= ^(1 << uint(r))
rs.vrs[r] = VRegInvalid
}
func (rs *regInUseSet) add(r RealReg, vr VReg) {
if r >= 64 {
return
}
rs.set |= 1 << uint(r)
rs.vrs[r] = vr
}
func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) {
for i := 0; i < 64; i++ {
if rs.set&(1<<uint(i)) != 0 {
f(RealReg(i), rs.vrs[i])
}
}
}
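
Since a RegSet is just a 64-bit mask, membership tests and iteration cost a bit test per register. The sketch below is not part of the diff; exampleRegSet is a hypothetical in-package usage with arbitrary register numbers.

func exampleRegSet() {
	callerSaved := NewRegSet(1, 2, 5) // Arbitrary example registers.
	if callerSaved.has(2) {
		// Visit the members in ascending register order.
		callerSaved.Range(func(r RealReg) {
			fmt.Println("caller-saved:", r)
		})
	}
}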

View File

@ -0,0 +1,43 @@
package backend
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// SSAValueDefinition represents a definition of an SSA value.
type SSAValueDefinition struct {
// BlockParamValue is valid if Instr == nil
BlockParamValue ssa.Value
// BlkParamVReg is valid if Instr == nil
BlkParamVReg regalloc.VReg
// Instr is not nil if this is a definition from an instruction.
Instr *ssa.Instruction
// N is the index of the return value in the instr's return values list.
N int
// RefCount is the number of references to the result.
RefCount int
}
func (d *SSAValueDefinition) IsFromInstr() bool {
return d.Instr != nil
}
func (d *SSAValueDefinition) IsFromBlockParam() bool {
return d.Instr == nil
}
func (d *SSAValueDefinition) SSAValue() ssa.Value {
if d.IsFromBlockParam() {
return d.BlockParamValue
} else {
r, rs := d.Instr.Returns()
if d.N == 0 {
return r
} else {
return rs[d.N-1]
}
}
}
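
To illustrate how these fields are typically consumed, here is a hypothetical helper, not part of the diff: a value that is defined by an instruction and referenced exactly once is the usual candidate for being matched or fused directly at its single use site.

// canFuseIntoSingleUse reports whether the definition comes from an instruction
// whose result is referenced exactly once.
func canFuseIntoSingleUse(d *SSAValueDefinition) bool {
	return d.IsFromInstr() && d.RefCount == 1
}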