[experiment] add alternative wasm sqlite3 implementation available via build-tag (#2863)

This allows GoToSocial to be built with [SQLite transpiled to WASM](https://github.com/ncruces/go-sqlite3), accessed through [Wazero](https://wazero.io/).
This commit is contained in:
kim
2024-05-27 15:46:15 +00:00
committed by GitHub
parent cce21c11cb
commit 1e7b32490d
398 changed files with 86174 additions and 684 deletions
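For orientation (not part of the diff itself): ncruces/go-sqlite3 provides a database/sql driver whose SQLite engine is a WASM build executed by Wazero at runtime, so no cgo is needed. The sketch below shows that library's usual registration pattern as I understand it, assuming fmt/database/sql imports; the database file name is arbitrary, and GoToSocial's actual build tag and database wiring are not shown in this excerpt.

package main

import (
"database/sql"
"fmt"

_ "github.com/ncruces/go-sqlite3/driver" // registers the "sqlite3" database/sql driver
_ "github.com/ncruces/go-sqlite3/embed" // embeds the SQLite WASM binary run under Wazero
)

func main() {
db, err := sql.Open("sqlite3", "file:demo.db")
if err != nil {
panic(err)
}
defer db.Close()

var version string
if err := db.QueryRow(`SELECT sqlite_version()`).Scan(&version); err != nil {
panic(err)
}
fmt.Println("sqlite version:", version)
}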


@@ -0,0 +1,170 @@
package backend
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type (
// FunctionABI represents the ABI information for a function which corresponds to a ssa.Signature.
FunctionABI struct {
Initialized bool
Args, Rets []ABIArg
ArgStackSize, RetStackSize int64
ArgIntRealRegs byte
ArgFloatRealRegs byte
RetIntRealRegs byte
RetFloatRealRegs byte
}
// ABIArg represents the location of either an argument or a return value.
ABIArg struct {
// Index is the index of the argument.
Index int
// Kind is the kind of the argument.
Kind ABIArgKind
// Reg is valid if Kind == ABIArgKindReg.
// This VReg must be based on RealReg.
Reg regalloc.VReg
// Offset is valid if Kind == ABIArgKindStack.
// This is the offset from the beginning of either arg or ret stack slot.
Offset int64
// Type is the type of the argument.
Type ssa.Type
}
// ABIArgKind is the kind of ABI argument.
ABIArgKind byte
)
const (
// ABIArgKindReg represents an argument passed in a register.
ABIArgKindReg = iota
// ABIArgKindStack represents an argument passed on the stack.
ABIArgKindStack
)
// String implements fmt.Stringer.
func (a *ABIArg) String() string {
return fmt.Sprintf("args[%d]: %s", a.Index, a.Kind)
}
// String implements fmt.Stringer.
func (a ABIArgKind) String() string {
switch a {
case ABIArgKindReg:
return "reg"
case ABIArgKindStack:
return "stack"
default:
panic("BUG")
}
}
// Init initializes the FunctionABI for the given signature.
func (a *FunctionABI) Init(sig *ssa.Signature, argResultInts, argResultFloats []regalloc.RealReg) {
if len(a.Rets) < len(sig.Results) {
a.Rets = make([]ABIArg, len(sig.Results))
}
a.Rets = a.Rets[:len(sig.Results)]
a.RetStackSize = a.setABIArgs(a.Rets, sig.Results, argResultInts, argResultFloats)
if argsNum := len(sig.Params); len(a.Args) < argsNum {
a.Args = make([]ABIArg, argsNum)
}
a.Args = a.Args[:len(sig.Params)]
a.ArgStackSize = a.setABIArgs(a.Args, sig.Params, argResultInts, argResultFloats)
// Gather the real registers usages in arg/return.
a.ArgIntRealRegs, a.ArgFloatRealRegs = 0, 0
a.RetIntRealRegs, a.RetFloatRealRegs = 0, 0
for i := range a.Rets {
r := &a.Rets[i]
if r.Kind == ABIArgKindReg {
if r.Type.IsInt() {
a.RetIntRealRegs++
} else {
a.RetFloatRealRegs++
}
}
}
for i := range a.Args {
arg := &a.Args[i]
if arg.Kind == ABIArgKindReg {
if arg.Type.IsInt() {
a.ArgIntRealRegs++
} else {
a.ArgFloatRealRegs++
}
}
}
a.Initialized = true
}
// setABIArgs sets the ABI arguments in the given slice. This assumes that len(s) >= len(types)
// where if len(s) > len(types), the last elements of s are for the multi-return slot.
func (a *FunctionABI) setABIArgs(s []ABIArg, types []ssa.Type, ints, floats []regalloc.RealReg) (stackSize int64) {
il, fl := len(ints), len(floats)
var stackOffset int64
intParamIndex, floatParamIndex := 0, 0
for i, typ := range types {
arg := &s[i]
arg.Index = i
arg.Type = typ
if typ.IsInt() {
if intParamIndex >= il {
arg.Kind = ABIArgKindStack
const slotSize = 8 // Align 8 bytes.
arg.Offset = stackOffset
stackOffset += slotSize
} else {
arg.Kind = ABIArgKindReg
arg.Reg = regalloc.FromRealReg(ints[intParamIndex], regalloc.RegTypeInt)
intParamIndex++
}
} else {
if floatParamIndex >= fl {
arg.Kind = ABIArgKindStack
slotSize := int64(8) // Align at least 8 bytes.
if typ.Bits() == 128 { // Vector.
slotSize = 16
}
arg.Offset = stackOffset
stackOffset += slotSize
} else {
arg.Kind = ABIArgKindReg
arg.Reg = regalloc.FromRealReg(floats[floatParamIndex], regalloc.RegTypeFloat)
floatParamIndex++
}
}
}
return stackOffset
}
func (a *FunctionABI) AlignedArgResultStackSlotSize() uint32 {
stackSlotSize := a.RetStackSize + a.ArgStackSize
// Align stackSlotSize to 16 bytes.
stackSlotSize = (stackSlotSize + 15) &^ 15
// Check overflow 32-bit.
if stackSlotSize > 0xFFFFFFFF {
panic("ABI stack slot size overflow")
}
return uint32(stackSlotSize)
}
func (a *FunctionABI) ABIInfoAsUint64() uint64 {
return uint64(a.ArgIntRealRegs)<<56 |
uint64(a.ArgFloatRealRegs)<<48 |
uint64(a.RetIntRealRegs)<<40 |
uint64(a.RetFloatRealRegs)<<32 |
uint64(a.AlignedArgResultStackSlotSize())
}
func ABIInfoFromUint64(info uint64) (argIntRealRegs, argFloatRealRegs, retIntRealRegs, retFloatRealRegs byte, stackSlotSize uint32) {
return byte(info >> 56), byte(info >> 48), byte(info >> 40), byte(info >> 32), uint32(info)
}
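As a quick illustration of the packing above (a sketch written as if inside this package, not part of the vendored file): the four register counts live in the top four bytes and the aligned stack slot size in the low 32 bits, so ABIInfoFromUint64 recovers exactly what ABIInfoAsUint64 stored.

func demoABIInfoRoundTrip() {
abi := FunctionABI{
ArgIntRealRegs: 2, ArgFloatRealRegs: 1,
RetIntRealRegs: 1, RetFloatRealRegs: 0,
// 10+10 = 20 bytes of stack, aligned up to 32 by AlignedArgResultStackSlotSize.
ArgStackSize: 10, RetStackSize: 10,
}
info := abi.ABIInfoAsUint64()
argInt, argFloat, retInt, retFloat, slot := ABIInfoFromUint64(info)
fmt.Println(argInt, argFloat, retInt, retFloat, slot) // prints: 2 1 1 0 32
}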


@@ -0,0 +1,3 @@
// Package backend must be free of Wasm-specific concept. In other words,
// this package must not import internal/wasm package.
package backend


@@ -0,0 +1,417 @@
package backend
import (
"context"
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
// NewCompiler returns a new Compiler that can generate machine code.
func NewCompiler(ctx context.Context, mach Machine, builder ssa.Builder) Compiler {
return newCompiler(ctx, mach, builder)
}
func newCompiler(_ context.Context, mach Machine, builder ssa.Builder) *compiler {
argResultInts, argResultFloats := mach.ArgsResultsRegs()
c := &compiler{
mach: mach, ssaBuilder: builder,
nextVRegID: regalloc.VRegIDNonReservedBegin,
argResultInts: argResultInts,
argResultFloats: argResultFloats,
}
mach.SetCompiler(c)
return c
}
// Compiler is the backend of wazevo which takes ssa.Builder and Machine,
// and uses the information there to emit the final machine code.
type Compiler interface {
// SSABuilder returns the ssa.Builder used by this compiler.
SSABuilder() ssa.Builder
// Compile executes the following steps:
// 1. Lower()
// 2. RegAlloc()
// 3. Finalize()
// 4. Encode()
//
// Each step can be called individually for testing purposes; therefore they are exposed in this interface too.
//
// The returned byte slices are the machine code and the relocation information for the machine code.
// The caller is responsible for copying them immediately since the compiler may reuse the buffer.
Compile(ctx context.Context) (_ []byte, _ []RelocationInfo, _ error)
// Lower lowers the given ssa.Instruction to the machine-specific instructions.
Lower()
// RegAlloc performs the register allocation after Lower is called.
RegAlloc()
// Finalize performs the finalization of the compilation, including machine code emission.
// This must be called after RegAlloc.
Finalize(ctx context.Context) error
// Buf returns the buffer of the encoded machine code. This is only used for testing purposes.
Buf() []byte
BufPtr() *[]byte
// Format returns the debug string of the current state of the compiler.
Format() string
// Init initializes the internal state of the compiler for the next compilation.
Init()
// AllocateVReg allocates a new virtual register of the given type.
AllocateVReg(typ ssa.Type) regalloc.VReg
// ValueDefinition returns the definition of the given value.
ValueDefinition(ssa.Value) *SSAValueDefinition
// VRegOf returns the virtual register of the given ssa.Value.
VRegOf(value ssa.Value) regalloc.VReg
// TypeOf returns the ssa.Type of the given virtual register.
TypeOf(regalloc.VReg) ssa.Type
// MatchInstr returns true if the given definition is from an instruction with the given opcode, the current group ID,
// and a refcount of 1. That means, the instruction can be merged/swapped within the current instruction group.
MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool
// MatchInstrOneOf is the same as MatchInstr but for multiple opcodes. If it matches one of ssa.Opcode,
// this returns the opcode. Otherwise, this returns ssa.OpcodeInvalid.
//
// Note: caller should be careful to avoid excessive allocation on opcodes slice.
MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode
// AddRelocationInfo appends the relocation information for the function reference at the current buffer offset.
AddRelocationInfo(funcRef ssa.FuncRef)
// AddSourceOffsetInfo appends the source offset information for the given offset.
AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset)
// SourceOffsetInfo returns the source offset information for the current buffer offset.
SourceOffsetInfo() []SourceOffsetInfo
// EmitByte appends a byte to the buffer. Used during the code emission.
EmitByte(b byte)
// Emit4Bytes appends 4 bytes to the buffer. Used during the code emission.
Emit4Bytes(b uint32)
// Emit8Bytes appends 8 bytes to the buffer. Used during the code emission.
Emit8Bytes(b uint64)
// GetFunctionABI returns the ABI information for the given signature.
GetFunctionABI(sig *ssa.Signature) *FunctionABI
}
// RelocationInfo represents the relocation information for a call instruction.
type RelocationInfo struct {
// Offset represents the offset from the beginning of the machine code of either a function or the entire module.
Offset int64
// Target is the target function of the call instruction.
FuncRef ssa.FuncRef
}
// compiler implements Compiler.
type compiler struct {
mach Machine
currentGID ssa.InstructionGroupID
ssaBuilder ssa.Builder
// nextVRegID is the next virtual register ID to be allocated.
nextVRegID regalloc.VRegID
// ssaValueToVRegs maps ssa.ValueID to regalloc.VReg.
ssaValueToVRegs [] /* ValueID to */ regalloc.VReg
// ssaValueDefinitions maps ssa.ValueID to its definition.
ssaValueDefinitions []SSAValueDefinition
// ssaValueRefCounts is a cached list obtained by ssa.Builder.ValueRefCounts().
ssaValueRefCounts []int
// returnVRegs is the list of virtual registers that store the return values.
returnVRegs []regalloc.VReg
varEdges [][2]regalloc.VReg
varEdgeTypes []ssa.Type
constEdges []struct {
cInst *ssa.Instruction
dst regalloc.VReg
}
vRegSet []bool
vRegIDs []regalloc.VRegID
tempRegs []regalloc.VReg
tmpVals []ssa.Value
ssaTypeOfVRegID [] /* VRegID to */ ssa.Type
buf []byte
relocations []RelocationInfo
sourceOffsets []SourceOffsetInfo
// abis maps ssa.SignatureID to the ABI implementation.
abis []FunctionABI
argResultInts, argResultFloats []regalloc.RealReg
}
// SourceOffsetInfo associates a source offset with an executable offset.
type SourceOffsetInfo struct {
// SourceOffset is the source offset in the original source code.
SourceOffset ssa.SourceOffset
// ExecutableOffset is the offset in the compiled executable.
ExecutableOffset int64
}
// Compile implements Compiler.Compile.
func (c *compiler) Compile(ctx context.Context) ([]byte, []RelocationInfo, error) {
c.Lower()
if wazevoapi.PrintSSAToBackendIRLowering && wazevoapi.PrintEnabledIndex(ctx) {
fmt.Printf("[[[after lowering for %s ]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
}
if wazevoapi.DeterministicCompilationVerifierEnabled {
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After lowering to ISA specific IR", c.Format())
}
c.RegAlloc()
if wazevoapi.PrintRegisterAllocated && wazevoapi.PrintEnabledIndex(ctx) {
fmt.Printf("[[[after regalloc for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
}
if wazevoapi.DeterministicCompilationVerifierEnabled {
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Register Allocation", c.Format())
}
if err := c.Finalize(ctx); err != nil {
return nil, nil, err
}
if wazevoapi.PrintFinalizedMachineCode && wazevoapi.PrintEnabledIndex(ctx) {
fmt.Printf("[[[after finalize for %s]]]%s\n", wazevoapi.GetCurrentFunctionName(ctx), c.Format())
}
if wazevoapi.DeterministicCompilationVerifierEnabled {
wazevoapi.VerifyOrSetDeterministicCompilationContextValue(ctx, "After Finalization", c.Format())
}
return c.buf, c.relocations, nil
}
// RegAlloc implements Compiler.RegAlloc.
func (c *compiler) RegAlloc() {
c.mach.RegAlloc()
}
// Finalize implements Compiler.Finalize.
func (c *compiler) Finalize(ctx context.Context) error {
c.mach.PostRegAlloc()
return c.mach.Encode(ctx)
}
// setCurrentGroupID sets the current instruction group ID.
func (c *compiler) setCurrentGroupID(gid ssa.InstructionGroupID) {
c.currentGID = gid
}
// assignVirtualRegisters assigns a virtual register to each valid ssa.ValueID in the ssa.Builder.
func (c *compiler) assignVirtualRegisters() {
builder := c.ssaBuilder
refCounts := builder.ValueRefCounts()
c.ssaValueRefCounts = refCounts
need := len(refCounts)
if need >= len(c.ssaValueToVRegs) {
c.ssaValueToVRegs = append(c.ssaValueToVRegs, make([]regalloc.VReg, need+1)...)
}
if need >= len(c.ssaValueDefinitions) {
c.ssaValueDefinitions = append(c.ssaValueDefinitions, make([]SSAValueDefinition, need+1)...)
}
for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
// First we assign a virtual register to each parameter.
for i := 0; i < blk.Params(); i++ {
p := blk.Param(i)
pid := p.ID()
typ := p.Type()
vreg := c.AllocateVReg(typ)
c.ssaValueToVRegs[pid] = vreg
c.ssaValueDefinitions[pid] = SSAValueDefinition{BlockParamValue: p, BlkParamVReg: vreg}
c.ssaTypeOfVRegID[vreg.ID()] = p.Type()
}
// Then assign a virtual register to each value produced by the instructions.
for cur := blk.Root(); cur != nil; cur = cur.Next() {
r, rs := cur.Returns()
var N int
if r.Valid() {
id := r.ID()
ssaTyp := r.Type()
typ := r.Type()
vReg := c.AllocateVReg(typ)
c.ssaValueToVRegs[id] = vReg
c.ssaValueDefinitions[id] = SSAValueDefinition{
Instr: cur,
N: 0,
RefCount: refCounts[id],
}
c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
N++
}
for _, r := range rs {
id := r.ID()
ssaTyp := r.Type()
vReg := c.AllocateVReg(ssaTyp)
c.ssaValueToVRegs[id] = vReg
c.ssaValueDefinitions[id] = SSAValueDefinition{
Instr: cur,
N: N,
RefCount: refCounts[id],
}
c.ssaTypeOfVRegID[vReg.ID()] = ssaTyp
N++
}
}
}
for i, retBlk := 0, builder.ReturnBlock(); i < retBlk.Params(); i++ {
typ := retBlk.Param(i).Type()
vReg := c.AllocateVReg(typ)
c.returnVRegs = append(c.returnVRegs, vReg)
c.ssaTypeOfVRegID[vReg.ID()] = typ
}
}
// AllocateVReg implements Compiler.AllocateVReg.
func (c *compiler) AllocateVReg(typ ssa.Type) regalloc.VReg {
regType := regalloc.RegTypeOf(typ)
r := regalloc.VReg(c.nextVRegID).SetRegType(regType)
id := r.ID()
if int(id) >= len(c.ssaTypeOfVRegID) {
c.ssaTypeOfVRegID = append(c.ssaTypeOfVRegID, make([]ssa.Type, id+1)...)
}
c.ssaTypeOfVRegID[id] = typ
c.nextVRegID++
return r
}
// Init implements Compiler.Init.
func (c *compiler) Init() {
c.currentGID = 0
c.nextVRegID = regalloc.VRegIDNonReservedBegin
c.returnVRegs = c.returnVRegs[:0]
c.mach.Reset()
c.varEdges = c.varEdges[:0]
c.constEdges = c.constEdges[:0]
c.buf = c.buf[:0]
c.sourceOffsets = c.sourceOffsets[:0]
c.relocations = c.relocations[:0]
}
// ValueDefinition implements Compiler.ValueDefinition.
func (c *compiler) ValueDefinition(value ssa.Value) *SSAValueDefinition {
return &c.ssaValueDefinitions[value.ID()]
}
// VRegOf implements Compiler.VRegOf.
func (c *compiler) VRegOf(value ssa.Value) regalloc.VReg {
return c.ssaValueToVRegs[value.ID()]
}
// Format implements Compiler.Format.
func (c *compiler) Format() string {
return c.mach.Format()
}
// TypeOf implements Compiler.TypeOf.
func (c *compiler) TypeOf(v regalloc.VReg) ssa.Type {
return c.ssaTypeOfVRegID[v.ID()]
}
// MatchInstr implements Compiler.MatchInstr.
func (c *compiler) MatchInstr(def *SSAValueDefinition, opcode ssa.Opcode) bool {
instr := def.Instr
return def.IsFromInstr() &&
instr.Opcode() == opcode &&
instr.GroupID() == c.currentGID &&
def.RefCount < 2
}
// MatchInstrOneOf implements Compiler.MatchInstrOneOf.
func (c *compiler) MatchInstrOneOf(def *SSAValueDefinition, opcodes []ssa.Opcode) ssa.Opcode {
instr := def.Instr
if !def.IsFromInstr() {
return ssa.OpcodeInvalid
}
if instr.GroupID() != c.currentGID {
return ssa.OpcodeInvalid
}
if def.RefCount >= 2 {
return ssa.OpcodeInvalid
}
opcode := instr.Opcode()
for _, op := range opcodes {
if opcode == op {
return opcode
}
}
return ssa.OpcodeInvalid
}
// SSABuilder implements Compiler.SSABuilder.
func (c *compiler) SSABuilder() ssa.Builder {
return c.ssaBuilder
}
// AddSourceOffsetInfo implements Compiler.AddSourceOffsetInfo.
func (c *compiler) AddSourceOffsetInfo(executableOffset int64, sourceOffset ssa.SourceOffset) {
c.sourceOffsets = append(c.sourceOffsets, SourceOffsetInfo{
SourceOffset: sourceOffset,
ExecutableOffset: executableOffset,
})
}
// SourceOffsetInfo implements Compiler.SourceOffsetInfo.
func (c *compiler) SourceOffsetInfo() []SourceOffsetInfo {
return c.sourceOffsets
}
// AddRelocationInfo implements Compiler.AddRelocationInfo.
func (c *compiler) AddRelocationInfo(funcRef ssa.FuncRef) {
c.relocations = append(c.relocations, RelocationInfo{
Offset: int64(len(c.buf)),
FuncRef: funcRef,
})
}
// Emit8Bytes implements Compiler.Emit8Bytes.
func (c *compiler) Emit8Bytes(b uint64) {
c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24), byte(b>>32), byte(b>>40), byte(b>>48), byte(b>>56))
}
// Emit4Bytes implements Compiler.Emit4Bytes.
func (c *compiler) Emit4Bytes(b uint32) {
c.buf = append(c.buf, byte(b), byte(b>>8), byte(b>>16), byte(b>>24))
}
// EmitByte implements Compiler.EmitByte.
func (c *compiler) EmitByte(b byte) {
c.buf = append(c.buf, b)
}
// Buf implements Compiler.Buf.
func (c *compiler) Buf() []byte {
return c.buf
}
// BufPtr implements Compiler.BufPtr.
func (c *compiler) BufPtr() *[]byte {
return &c.buf
}
func (c *compiler) GetFunctionABI(sig *ssa.Signature) *FunctionABI {
if int(sig.ID) >= len(c.abis) {
c.abis = append(c.abis, make([]FunctionABI, int(sig.ID)+1)...)
}
abi := &c.abis[sig.ID]
if abi.Initialized {
return abi
}
abi.Init(sig, c.argResultInts, c.argResultFloats)
return abi
}
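To tie this file together, here is a minimal usage sketch of the Compiler interface above (not part of the vendored file): construct the compiler with NewCompiler, call Compile (which runs Lower, RegAlloc, Finalize and Encode), and copy the results out before the buffers are reused. The mach and builder parameters stand in for a concrete Machine implementation and an ssa.Builder prepared by the frontend, which are outside this excerpt.

func compileOne(ctx context.Context, mach Machine, builder ssa.Builder) ([]byte, []RelocationInfo, error) {
c := NewCompiler(ctx, mach, builder)
body, relocs, err := c.Compile(ctx)
if err != nil {
return nil, nil, err
}
// The compiler may reuse its internal buffers for the next function,
// so copy the machine code and relocations out immediately.
executable := make([]byte, len(body))
copy(executable, body)
relocsCopy := append([]RelocationInfo(nil), relocs...)
return executable, relocsCopy, nil
}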


@@ -0,0 +1,226 @@
package backend
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// Lower implements Compiler.Lower.
func (c *compiler) Lower() {
c.assignVirtualRegisters()
c.mach.SetCurrentABI(c.GetFunctionABI(c.ssaBuilder.Signature()))
c.mach.ExecutableContext().StartLoweringFunction(c.ssaBuilder.BlockIDMax())
c.lowerBlocks()
}
// lowerBlocks lowers each block in the ssa.Builder.
func (c *compiler) lowerBlocks() {
builder := c.ssaBuilder
for blk := builder.BlockIteratorReversePostOrderBegin(); blk != nil; blk = builder.BlockIteratorReversePostOrderNext() {
c.lowerBlock(blk)
}
ectx := c.mach.ExecutableContext()
// After lowering all blocks, we need to link adjacent blocks to lay out one single instruction list.
var prev ssa.BasicBlock
for next := builder.BlockIteratorReversePostOrderBegin(); next != nil; next = builder.BlockIteratorReversePostOrderNext() {
if prev != nil {
ectx.LinkAdjacentBlocks(prev, next)
}
prev = next
}
}
func (c *compiler) lowerBlock(blk ssa.BasicBlock) {
mach := c.mach
ectx := mach.ExecutableContext()
ectx.StartBlock(blk)
// We traverse the instructions in reverse order because we might want to lower multiple
// instructions together.
cur := blk.Tail()
// First gather the branching instructions at the end of the block.
var br0, br1 *ssa.Instruction
if cur.IsBranching() {
br0 = cur
cur = cur.Prev()
if cur != nil && cur.IsBranching() {
br1 = cur
cur = cur.Prev()
}
}
if br0 != nil {
c.lowerBranches(br0, br1)
}
if br1 != nil && br0 == nil {
panic("BUG? when a block has conditional branch but doesn't end with an unconditional branch?")
}
// Now start lowering the non-branching instructions.
for ; cur != nil; cur = cur.Prev() {
c.setCurrentGroupID(cur.GroupID())
if cur.Lowered() {
continue
}
switch cur.Opcode() {
case ssa.OpcodeReturn:
rets := cur.ReturnVals()
if len(rets) > 0 {
c.mach.LowerReturns(rets)
}
c.mach.InsertReturn()
default:
mach.LowerInstr(cur)
}
ectx.FlushPendingInstructions()
}
// Finally, if this is the entry block, we have to insert copies of arguments from their real locations to the VRegs.
if blk.EntryBlock() {
c.lowerFunctionArguments(blk)
}
ectx.EndBlock()
}
// lowerBranches is called right after StartBlock and before any LowerInstr call if
// there are branches to the given block. br0 is the very end of the block and br1 is the one before br0 if it exists.
// At least br0 is not nil, but br1 can be nil if there's no branching before br0.
//
// See ssa.Instruction IsBranching, and the comment on ssa.BasicBlock.
func (c *compiler) lowerBranches(br0, br1 *ssa.Instruction) {
ectx := c.mach.ExecutableContext()
c.setCurrentGroupID(br0.GroupID())
c.mach.LowerSingleBranch(br0)
ectx.FlushPendingInstructions()
if br1 != nil {
c.setCurrentGroupID(br1.GroupID())
c.mach.LowerConditionalBranch(br1)
ectx.FlushPendingInstructions()
}
if br0.Opcode() == ssa.OpcodeJump {
_, args, target := br0.BranchData()
argExists := len(args) != 0
if argExists && br1 != nil {
panic("BUG: critical edge split failed")
}
if argExists && target.ReturnBlock() {
if len(args) > 0 {
c.mach.LowerReturns(args)
}
} else if argExists {
c.lowerBlockArguments(args, target)
}
}
ectx.FlushPendingInstructions()
}
func (c *compiler) lowerFunctionArguments(entry ssa.BasicBlock) {
ectx := c.mach.ExecutableContext()
c.tmpVals = c.tmpVals[:0]
for i := 0; i < entry.Params(); i++ {
p := entry.Param(i)
if c.ssaValueRefCounts[p.ID()] > 0 {
c.tmpVals = append(c.tmpVals, p)
} else {
// If the argument is not used, we can just pass an invalid value.
c.tmpVals = append(c.tmpVals, ssa.ValueInvalid)
}
}
c.mach.LowerParams(c.tmpVals)
ectx.FlushPendingInstructions()
}
// lowerBlockArguments lowers the passing of arguments to the given successor block.
func (c *compiler) lowerBlockArguments(args []ssa.Value, succ ssa.BasicBlock) {
if len(args) != succ.Params() {
panic("BUG: mismatched number of arguments")
}
c.varEdges = c.varEdges[:0]
c.varEdgeTypes = c.varEdgeTypes[:0]
c.constEdges = c.constEdges[:0]
for i := 0; i < len(args); i++ {
dst := succ.Param(i)
src := args[i]
dstReg := c.VRegOf(dst)
srcDef := c.ssaValueDefinitions[src.ID()]
if srcDef.IsFromInstr() && srcDef.Instr.Constant() {
c.constEdges = append(c.constEdges, struct {
cInst *ssa.Instruction
dst regalloc.VReg
}{cInst: srcDef.Instr, dst: dstReg})
} else {
srcReg := c.VRegOf(src)
// Even when src == dst, insert the move so that such registers are kept alive.
c.varEdges = append(c.varEdges, [2]regalloc.VReg{srcReg, dstReg})
c.varEdgeTypes = append(c.varEdgeTypes, src.Type())
}
}
// Check if there's an overlap among the dsts and srcs in varEdges.
c.vRegIDs = c.vRegIDs[:0]
for _, edge := range c.varEdges {
src := edge[0].ID()
if int(src) >= len(c.vRegSet) {
c.vRegSet = append(c.vRegSet, make([]bool, src+1)...)
}
c.vRegSet[src] = true
c.vRegIDs = append(c.vRegIDs, src)
}
separated := true
for _, edge := range c.varEdges {
dst := edge[1].ID()
if int(dst) >= len(c.vRegSet) {
c.vRegSet = append(c.vRegSet, make([]bool, dst+1)...)
} else {
if c.vRegSet[dst] {
separated = false
break
}
}
}
for _, id := range c.vRegIDs {
c.vRegSet[id] = false // reset for the next use.
}
if separated {
// If there's no overlap, we can simply move the source to destination.
for i, edge := range c.varEdges {
src, dst := edge[0], edge[1]
c.mach.InsertMove(dst, src, c.varEdgeTypes[i])
}
} else {
// Otherwise, we go through temporary registers in two phases.
//
// First move all of the sources to temporary registers.
c.tempRegs = c.tempRegs[:0]
for i, edge := range c.varEdges {
src := edge[0]
typ := c.varEdgeTypes[i]
temp := c.AllocateVReg(typ)
c.tempRegs = append(c.tempRegs, temp)
c.mach.InsertMove(temp, src, typ)
}
// Then move the temporary registers to the destination.
for i, edge := range c.varEdges {
temp := c.tempRegs[i]
dst := edge[1]
c.mach.InsertMove(dst, temp, c.varEdgeTypes[i])
}
}
// Finally, move the constants.
for _, edge := range c.constEdges {
cInst, dst := edge.cInst, edge.dst
c.mach.InsertLoadConstantBlockArg(cInst, dst)
}
}
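The overlap check above exists because block-argument moves form a parallel assignment: if a destination register of one edge is also the source of another, executing the moves one at a time would clobber a value that is still needed. A minimal sketch of the problem on plain values (not the real VReg machinery):

// Edges: {v1 -> v2, v2 -> v1}. Moving directly in edge order would do
// v2 = v1; v1 = v2 // v1's old value overwrites v2 before v2 is read.
// Mirroring the two-phase scheme used above when `separated` is false:
func lowerSwapEdges(v1, v2 uint64) (newV1, newV2 uint64) {
t1, t2 := v1, v2 // phase 1: copy every source into a fresh temporary
v2, v1 = t1, t2 // phase 2: write the temporaries into the destinations
return v1, v2
}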


@@ -0,0 +1,219 @@
package backend
import (
"fmt"
"math"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type ExecutableContext interface {
// StartLoweringFunction is called when the lowering of the given function is started.
// maximumBlockID is the maximum value of ssa.BasicBlockID existing in the function.
StartLoweringFunction(maximumBlockID ssa.BasicBlockID)
// LinkAdjacentBlocks is called after finished lowering all blocks in order to create one single instruction list.
LinkAdjacentBlocks(prev, next ssa.BasicBlock)
// StartBlock is called when the compilation of the given block is started.
// The order of this being called is the reverse post order of the ssa.BasicBlock(s) as we iterate with
// ssa.Builder BlockIteratorReversePostOrderBegin and BlockIteratorReversePostOrderEnd.
StartBlock(ssa.BasicBlock)
// EndBlock is called when the compilation of the current block is finished.
EndBlock()
// FlushPendingInstructions flushes the pending instructions to the buffer.
// This will be called after the lowering of each SSA Instruction.
FlushPendingInstructions()
}
type ExecutableContextT[Instr any] struct {
CurrentSSABlk ssa.BasicBlock
// InstructionPool is the pool of instructions.
InstructionPool wazevoapi.Pool[Instr]
asNop func(*Instr)
setNext func(*Instr, *Instr)
setPrev func(*Instr, *Instr)
// RootInstr is the root instruction of the executable.
RootInstr *Instr
labelPositionPool wazevoapi.Pool[LabelPosition[Instr]]
NextLabel Label
// LabelPositions maps a label to the instructions of the region which the label represents.
LabelPositions map[Label]*LabelPosition[Instr]
OrderedBlockLabels []*LabelPosition[Instr]
// PerBlockHead and PerBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
PerBlockHead, PerBlockEnd *Instr
// PendingInstructions are the instructions which are not yet emitted into the instruction list.
PendingInstructions []*Instr
// SsaBlockIDToLabels maps an SSA block ID to the label.
SsaBlockIDToLabels []Label
}
func NewExecutableContextT[Instr any](
resetInstruction func(*Instr),
setNext func(*Instr, *Instr),
setPrev func(*Instr, *Instr),
asNop func(*Instr),
) *ExecutableContextT[Instr] {
return &ExecutableContextT[Instr]{
InstructionPool: wazevoapi.NewPool[Instr](resetInstruction),
asNop: asNop,
setNext: setNext,
setPrev: setPrev,
labelPositionPool: wazevoapi.NewPool[LabelPosition[Instr]](resetLabelPosition[Instr]),
LabelPositions: make(map[Label]*LabelPosition[Instr]),
NextLabel: LabelInvalid,
}
}
func resetLabelPosition[T any](l *LabelPosition[T]) {
*l = LabelPosition[T]{}
}
// StartLoweringFunction implements ExecutableContext.
func (e *ExecutableContextT[Instr]) StartLoweringFunction(max ssa.BasicBlockID) {
imax := int(max)
if len(e.SsaBlockIDToLabels) <= imax {
// Eagerly allocate labels for the blocks since the underlying slice will be used for the next iteration.
e.SsaBlockIDToLabels = append(e.SsaBlockIDToLabels, make([]Label, imax+1)...)
}
}
func (e *ExecutableContextT[Instr]) StartBlock(blk ssa.BasicBlock) {
e.CurrentSSABlk = blk
l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
if l == LabelInvalid {
l = e.AllocateLabel()
e.SsaBlockIDToLabels[blk.ID()] = l
}
end := e.allocateNop0()
e.PerBlockHead, e.PerBlockEnd = end, end
labelPos, ok := e.LabelPositions[l]
if !ok {
labelPos = e.AllocateLabelPosition(l)
e.LabelPositions[l] = labelPos
}
e.OrderedBlockLabels = append(e.OrderedBlockLabels, labelPos)
labelPos.Begin, labelPos.End = end, end
labelPos.SB = blk
}
// EndBlock implements ExecutableContext.
func (e *ExecutableContextT[T]) EndBlock() {
// Insert nop0 as the head of the block for convenience to simplify the logic of inserting instructions.
e.insertAtPerBlockHead(e.allocateNop0())
l := e.SsaBlockIDToLabels[e.CurrentSSABlk.ID()]
e.LabelPositions[l].Begin = e.PerBlockHead
if e.CurrentSSABlk.EntryBlock() {
e.RootInstr = e.PerBlockHead
}
}
func (e *ExecutableContextT[T]) insertAtPerBlockHead(i *T) {
if e.PerBlockHead == nil {
e.PerBlockHead = i
e.PerBlockEnd = i
return
}
e.setNext(i, e.PerBlockHead)
e.setPrev(e.PerBlockHead, i)
e.PerBlockHead = i
}
// FlushPendingInstructions implements ExecutableContext.
func (e *ExecutableContextT[T]) FlushPendingInstructions() {
l := len(e.PendingInstructions)
if l == 0 {
return
}
for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order.
e.insertAtPerBlockHead(e.PendingInstructions[i])
}
e.PendingInstructions = e.PendingInstructions[:0]
}
func (e *ExecutableContextT[T]) Reset() {
e.labelPositionPool.Reset()
e.InstructionPool.Reset()
for l := Label(0); l <= e.NextLabel; l++ {
delete(e.LabelPositions, l)
}
e.PendingInstructions = e.PendingInstructions[:0]
e.OrderedBlockLabels = e.OrderedBlockLabels[:0]
e.RootInstr = nil
e.SsaBlockIDToLabels = e.SsaBlockIDToLabels[:0]
e.PerBlockHead, e.PerBlockEnd = nil, nil
e.NextLabel = LabelInvalid
}
// AllocateLabel allocates an unused label.
func (e *ExecutableContextT[T]) AllocateLabel() Label {
e.NextLabel++
return e.NextLabel
}
func (e *ExecutableContextT[T]) AllocateLabelPosition(la Label) *LabelPosition[T] {
l := e.labelPositionPool.Allocate()
l.L = la
return l
}
func (e *ExecutableContextT[T]) GetOrAllocateSSABlockLabel(blk ssa.BasicBlock) Label {
if blk.ReturnBlock() {
return LabelReturn
}
l := e.SsaBlockIDToLabels[blk.ID()]
if l == LabelInvalid {
l = e.AllocateLabel()
e.SsaBlockIDToLabels[blk.ID()] = l
}
return l
}
func (e *ExecutableContextT[T]) allocateNop0() *T {
i := e.InstructionPool.Allocate()
e.asNop(i)
return i
}
// LinkAdjacentBlocks implements ExecutableContext.
func (e *ExecutableContextT[T]) LinkAdjacentBlocks(prev, next ssa.BasicBlock) {
prevLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(prev)]
nextLabelPos := e.LabelPositions[e.GetOrAllocateSSABlockLabel(next)]
e.setNext(prevLabelPos.End, nextLabelPos.Begin)
}
// LabelPosition represents the regions of the generated code which the label represents.
type LabelPosition[Instr any] struct {
SB ssa.BasicBlock
L Label
Begin, End *Instr
BinaryOffset int64
}
// Label represents a position in the generated code which is either
// a real instruction or the constant pool (e.g. jump tables).
//
// This is exactly the same as the traditional "label" in assembly code.
type Label uint32
const (
LabelInvalid Label = 0
LabelReturn Label = math.MaxUint32
)
// String implements fmt.Stringer.
func (l Label) String() string {
return fmt.Sprintf("L%d", l)
}


@ -0,0 +1,33 @@
package backend
import "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
// GoFunctionCallRequiredStackSize returns the size of the stack required for the Go function call.
// argBegin is the index of the first argument in the signature which is neither the execution context nor the module context.
func GoFunctionCallRequiredStackSize(sig *ssa.Signature, argBegin int) (ret, retUnaligned int64) {
var paramNeededInBytes, resultNeededInBytes int64
for _, p := range sig.Params[argBegin:] {
s := int64(p.Size())
if s < 8 {
s = 8 // We use uint64 for all basic types, except SIMD v128.
}
paramNeededInBytes += s
}
for _, r := range sig.Results {
s := int64(r.Size())
if s < 8 {
s = 8 // We use uint64 for all basic types, except SIMD v128.
}
resultNeededInBytes += s
}
if paramNeededInBytes > resultNeededInBytes {
ret = paramNeededInBytes
} else {
ret = resultNeededInBytes
}
retUnaligned = ret
// Align to 16 bytes.
ret = (ret + 15) &^ 15
return
}
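A worked example of the sizing above (an illustration written as if inside this package and assuming an fmt import, not part of the file): suppose the signature's params after argBegin are (i32, f64) and the results are (v128, i64).

func demoRequiredStackSize(sig *ssa.Signature) {
// paramNeededInBytes = 8 + 8 = 16 (i32 is widened to a uint64 slot)
// resultNeededInBytes = 16 + 8 = 24 (v128 keeps its 16 bytes)
// ret = max(16, 24) = 24, retUnaligned = 24, then ret is aligned up to 32.
ret, retUnaligned := GoFunctionCallRequiredStackSize(sig, 2)
fmt.Println(ret, retUnaligned) // 32 24 for the signature described above
}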


@@ -0,0 +1,186 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// For the details of the ABI, see:
// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#amd64-architecture
var (
intArgResultRegs = []regalloc.RealReg{rax, rbx, rcx, rdi, rsi, r8, r9, r10, r11}
floatArgResultRegs = []regalloc.RealReg{xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7}
)
var regInfo = &regalloc.RegisterInfo{
AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
regalloc.RegTypeInt: {
rax, rcx, rdx, rbx, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15,
},
regalloc.RegTypeFloat: {
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
},
},
CalleeSavedRegisters: regalloc.NewRegSet(
rdx, r12, r13, r14, r15,
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
),
CallerSavedRegisters: regalloc.NewRegSet(
rax, rcx, rbx, rsi, rdi, r8, r9, r10, r11,
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
),
RealRegToVReg: []regalloc.VReg{
rax: raxVReg, rcx: rcxVReg, rdx: rdxVReg, rbx: rbxVReg, rsp: rspVReg, rbp: rbpVReg, rsi: rsiVReg, rdi: rdiVReg,
r8: r8VReg, r9: r9VReg, r10: r10VReg, r11: r11VReg, r12: r12VReg, r13: r13VReg, r14: r14VReg, r15: r15VReg,
xmm0: xmm0VReg, xmm1: xmm1VReg, xmm2: xmm2VReg, xmm3: xmm3VReg, xmm4: xmm4VReg, xmm5: xmm5VReg, xmm6: xmm6VReg,
xmm7: xmm7VReg, xmm8: xmm8VReg, xmm9: xmm9VReg, xmm10: xmm10VReg, xmm11: xmm11VReg, xmm12: xmm12VReg,
xmm13: xmm13VReg, xmm14: xmm14VReg, xmm15: xmm15VReg,
},
RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
RealRegType: func(r regalloc.RealReg) regalloc.RegType {
if r < xmm0 {
return regalloc.RegTypeInt
}
return regalloc.RegTypeFloat
},
}
// ArgsResultsRegs implements backend.Machine.
func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
return intArgResultRegs, floatArgResultRegs
}
// LowerParams implements backend.Machine.
func (m *machine) LowerParams(args []ssa.Value) {
a := m.currentABI
for i, ssaArg := range args {
if !ssaArg.Valid() {
continue
}
reg := m.c.VRegOf(ssaArg)
arg := &a.Args[i]
if arg.Kind == backend.ABIArgKindReg {
m.InsertMove(reg, arg.Reg, arg.Type)
} else {
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <-- RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ........... |
// | spill slot 0 |
// RSP--> +-----------------+
// (low address)
// Load the value from the arg stack slot above the current RBP.
load := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmRBPReg(uint32(arg.Offset + 16)))
switch arg.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, mem, reg)
case ssa.TypeI64:
load.asMov64MR(mem, reg)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, mem, reg)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, reg)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, reg)
default:
panic("BUG")
}
m.insert(load)
}
}
}
// LowerReturns implements backend.Machine.
func (m *machine) LowerReturns(rets []ssa.Value) {
// Load the XMM registers first, as inlining a constant return value
// might need a temporary register.
a := m.currentABI
for i, ret := range rets {
r := &a.Rets[i]
if !r.Type.IsInt() {
m.LowerReturn(ret, r)
}
}
// Then load the GPR registers.
for i, ret := range rets {
r := &a.Rets[i]
if r.Type.IsInt() {
m.LowerReturn(ret, r)
}
}
}
func (m *machine) LowerReturn(ret ssa.Value, r *backend.ABIArg) {
reg := m.c.VRegOf(ret)
if def := m.c.ValueDefinition(ret); def.IsFromInstr() {
// Constant instructions are inlined.
if inst := def.Instr; inst.Constant() {
m.insertLoadConstant(inst, reg)
}
}
if r.Kind == backend.ABIArgKindReg {
m.InsertMove(r.Reg, reg, ret.Type())
} else {
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <-- RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ........... |
// | spill slot 0 |
// RSP--> +-----------------+
// (low address)
// Store the value to the return stack slot above the current RBP.
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmRBPReg(uint32(m.currentABI.ArgStackSize + 16 + r.Offset)))
switch r.Type {
case ssa.TypeI32:
store.asMovRM(reg, mem, 4)
case ssa.TypeI64:
store.asMovRM(reg, mem, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, reg, mem)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, reg, mem)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, reg, mem)
}
m.insert(store)
}
}


@@ -0,0 +1,9 @@
package amd64
// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below.
// This implements wazevo.entrypoint, and see the comments there for detail.
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)


@@ -0,0 +1,29 @@
#include "funcdata.h"
#include "textflag.h"
// entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
MOVQ preambleExecutable+0(FP), R11
MOVQ functionExecutable+8(FP), R14
MOVQ executionContextPtr+16(FP), AX // First argument is passed in AX.
MOVQ moduleContextPtr+24(FP), BX // Second argument is passed in BX.
MOVQ paramResultSlicePtr+32(FP), R12
MOVQ goAllocatedStackSlicePtr+40(FP), R13
JMP R11
// afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)
TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
MOVQ executable+0(FP), CX
MOVQ executionContextPtr+8(FP), AX // First argument is passed in AX.
// Save the stack pointer and frame pointer.
MOVQ BP, 16(AX) // 16 == ExecutionContextOffsetOriginalFramePointer
MOVQ SP, 24(AX) // 24 == ExecutionContextOffsetOriginalStackPointer
// Then set the stack pointer and frame pointer to the values we got from the Go runtime.
MOVQ framePointer+24(FP), BP
// WARNING: do not update SP before BP, because the Go assembler translates (FP) as (SP) + 8.
MOVQ stackPointer+16(FP), SP
JMP CX


@@ -0,0 +1,248 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
var (
executionContextPtrReg = raxVReg
// The following are callee-saved registers. They can be used freely in the entry preamble
// since the preamble is called via a Go assembly function which has a stack-based ABI.
// savedExecutionContextPtr must also be a callee-saved reg so that it can be used in the prologue and epilogue.
savedExecutionContextPtr = rdxVReg
// paramResultSlicePtr must match with entrypoint function in abi_entry_amd64.s.
paramResultSlicePtr = r12VReg
// goAllocatedStackPtr must match with entrypoint function in abi_entry_amd64.s.
goAllocatedStackPtr = r13VReg
// functionExecutable must match with entrypoint function in abi_entry_amd64.s.
functionExecutable = r14VReg
tmpIntReg = r15VReg
tmpXmmReg = xmm15VReg
)
// CompileEntryPreamble implements backend.Machine.
func (m *machine) CompileEntryPreamble(sig *ssa.Signature) []byte {
root := m.compileEntryPreamble(sig)
m.encodeWithoutSSA(root)
buf := m.c.Buf()
return buf
}
func (m *machine) compileEntryPreamble(sig *ssa.Signature) *instruction {
abi := backend.FunctionABI{}
abi.Init(sig, intArgResultRegs, floatArgResultRegs)
root := m.allocateNop()
//// ----------------------------------- prologue ----------------------------------- ////
// First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
// mov %executionContextPtrReg, %savedExecutionContextPtr
cur := m.move64(executionContextPtrReg, savedExecutionContextPtr, root)
// Next is to save the original RBP and RSP into the execution context.
cur = m.saveOriginalRSPRBP(cur)
// Now set the RSP to the Go-allocated stack pointer.
// mov %goAllocatedStackPtr, %rsp
cur = m.move64(goAllocatedStackPtr, rspVReg, cur)
if stackSlotSize := abi.AlignedArgResultStackSlotSize(); stackSlotSize > 0 {
// Allocate stack slots for the arguments and return values.
// sub $stackSlotSize, %rsp
spDec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(stackSlotSize)), rspVReg, true)
cur = linkInstr(cur, spDec)
}
var offset uint32
for i := range abi.Args {
if i < 2 {
// execution context ptr and module context ptr are passed in rax and rbx by the Go assembly function.
continue
}
arg := &abi.Args[i]
cur = m.goEntryPreamblePassArg(cur, paramResultSlicePtr, offset, arg)
if arg.Type == ssa.TypeV128 {
offset += 16
} else {
offset += 8
}
}
// Zero out RBP so that the unwind/stack growth code can correctly detect the end of the stack.
zerosRbp := m.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(rbpVReg), rbpVReg, true)
cur = linkInstr(cur, zerosRbp)
// Now ready to call the real function. Note that at this point the stack pointer is already set to the Go-allocated stack,
// which is aligned to 16 bytes.
call := m.allocateInstr().asCallIndirect(newOperandReg(functionExecutable), &abi)
cur = linkInstr(cur, call)
//// ----------------------------------- epilogue ----------------------------------- ////
// Read the results from regs and the stack, and set them correctly into the paramResultSlicePtr.
offset = 0
for i := range abi.Rets {
r := &abi.Rets[i]
cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, offset, r, uint32(abi.ArgStackSize))
if r.Type == ssa.TypeV128 {
offset += 16
} else {
offset += 8
}
}
// Finally, restore the original RBP and RSP.
cur = m.restoreOriginalRSPRBP(cur)
ret := m.allocateInstr().asRet()
linkInstr(cur, ret)
return root
}
// saveOriginalRSPRBP saves the original RSP and RBP into the execution context.
func (m *machine) saveOriginalRSPRBP(cur *instruction) *instruction {
// mov %rbp, wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg)
// mov %rsp, wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg)
cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, true, cur)
cur = m.loadOrStore64AtExecutionCtx(executionContextPtrReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, true, cur)
return cur
}
// restoreOriginalRSPRBP restores the original RSP and RBP from the execution context.
func (m *machine) restoreOriginalRSPRBP(cur *instruction) *instruction {
// mov wazevoapi.ExecutionContextOffsetOriginalFramePointer(%executionContextPtrReg), %rbp
// mov wazevoapi.ExecutionContextOffsetOriginalStackPointer(%executionContextPtrReg), %rsp
cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalFramePointer, rbpVReg, false, cur)
cur = m.loadOrStore64AtExecutionCtx(savedExecutionContextPtr, wazevoapi.ExecutionContextOffsetOriginalStackPointer, rspVReg, false, cur)
return cur
}
func (m *machine) move64(src, dst regalloc.VReg, prev *instruction) *instruction {
mov := m.allocateInstr().asMovRR(src, dst, true)
return linkInstr(prev, mov)
}
func (m *machine) loadOrStore64AtExecutionCtx(execCtx regalloc.VReg, offset wazevoapi.Offset, r regalloc.VReg, store bool, prev *instruction) *instruction {
mem := newOperandMem(m.newAmodeImmReg(offset.U32(), execCtx))
instr := m.allocateInstr()
if store {
instr.asMovRM(r, mem, 8)
} else {
instr.asMov64MR(mem, r)
}
return linkInstr(prev, instr)
}
// This is for debugging.
func (m *machine) linkUD2(cur *instruction) *instruction { //nolint
return linkInstr(cur, m.allocateInstr().asUD2())
}
func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, offsetInParamSlice uint32, arg *backend.ABIArg) *instruction {
var dst regalloc.VReg
argTyp := arg.Type
if arg.Kind == backend.ABIArgKindStack {
// The argument is passed on the stack, so load it into a temporary register first.
switch argTyp {
case ssa.TypeI32, ssa.TypeI64:
dst = tmpIntReg
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
dst = tmpXmmReg
default:
panic("BUG")
}
} else {
dst = arg.Reg
}
load := m.allocateInstr()
a := newOperandMem(m.newAmodeImmReg(offsetInParamSlice, paramSlicePtr))
switch arg.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, a, dst)
case ssa.TypeI64:
load.asMov64MR(a, dst)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, a, dst)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, a, dst)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, a, dst)
}
cur = linkInstr(cur, load)
if arg.Kind == backend.ABIArgKindStack {
// Store back to the stack.
store := m.allocateInstr()
a := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset), rspVReg))
switch arg.Type {
case ssa.TypeI32:
store.asMovRM(dst, a, 4)
case ssa.TypeI64:
store.asMovRM(dst, a, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, dst, a)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, dst, a)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, dst, a)
}
cur = linkInstr(cur, store)
}
return cur
}
func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, offsetInResultSlice uint32, result *backend.ABIArg, resultStackSlotBeginOffset uint32) *instruction {
var r regalloc.VReg
if result.Kind == backend.ABIArgKindStack {
// Load the value to the temporary.
load := m.allocateInstr()
offset := resultStackSlotBeginOffset + uint32(result.Offset)
a := newOperandMem(m.newAmodeImmReg(offset, rspVReg))
switch result.Type {
case ssa.TypeI32:
r = tmpIntReg
load.asMovzxRmR(extModeLQ, a, r)
case ssa.TypeI64:
r = tmpIntReg
load.asMov64MR(a, r)
case ssa.TypeF32:
r = tmpXmmReg
load.asXmmUnaryRmR(sseOpcodeMovss, a, r)
case ssa.TypeF64:
r = tmpXmmReg
load.asXmmUnaryRmR(sseOpcodeMovsd, a, r)
case ssa.TypeV128:
r = tmpXmmReg
load.asXmmUnaryRmR(sseOpcodeMovdqu, a, r)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
} else {
r = result.Reg
}
store := m.allocateInstr()
a := newOperandMem(m.newAmodeImmReg(offsetInResultSlice, resultSlicePtr))
switch result.Type {
case ssa.TypeI32:
store.asMovRM(r, a, 4)
case ssa.TypeI64:
store.asMovRM(r, a, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, r, a)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, r, a)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, r, a)
}
return linkInstr(cur, store)
}


@@ -0,0 +1,443 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
var calleeSavedVRegs = []regalloc.VReg{
rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
}
// CompileGoFunctionTrampoline implements backend.Machine.
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
ectx := m.ectx
argBegin := 1 // Skips exec context by default.
if needModuleContextPtr {
argBegin++
}
abi := &backend.FunctionABI{}
abi.Init(sig, intArgResultRegs, floatArgResultRegs)
m.currentABI = abi
cur := m.allocateNop()
ectx.RootInstr = cur
// Execution context is always the first argument.
execCtrPtr := raxVReg
// First we update RBP and RSP just like the normal prologue.
//
// (high address) (high address)
// RBP ----> +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | ====> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | Return Addr | | Return Addr |
// RSP ----> +-----------------+ | Caller_RBP |
// (low address) +-----------------+ <----- RSP, RBP
//
cur = m.setupRBPRSP(cur)
goSliceSizeAligned, goSliceSizeAlignedUnaligned := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
cur = m.insertStackBoundsCheck(goSliceSizeAligned+8 /* size of the Go slice */, cur)
// Save the callee saved registers.
cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
if needModuleContextPtr {
moduleCtrPtr := rbxVReg // Module context is always the second argument.
mem := m.newAmodeImmReg(
wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.U32(),
execCtrPtr)
store := m.allocateInstr().asMovRM(moduleCtrPtr, newOperandMem(mem), 8)
cur = linkInstr(cur, store)
}
// Now let's advance the RSP to the stack slot for the arguments.
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | =======> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | Return Addr | | Return Addr |
// | Caller_RBP | | Caller_RBP |
// RBP,RSP --> +-----------------+ +-----------------+ <----- RBP
// (low address) | arg[N]/ret[M] |
// | .......... |
// | arg[1]/ret[1] |
// | arg[0]/ret[0] |
// +-----------------+ <----- RSP
// (low address)
//
// where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
// therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
// the arguments/return values to/from Go function.
cur = m.addRSP(-int32(goSliceSizeAligned), cur)
// Next, we need to store all the arguments to the stack in the typical Wasm stack style.
var offsetInGoSlice int32
for i := range abi.Args[argBegin:] {
arg := &abi.Args[argBegin+i]
var v regalloc.VReg
if arg.Kind == backend.ABIArgKindReg {
v = arg.Reg
} else {
// We have saved callee saved registers, so we can use them.
if arg.Type.IsInt() {
v = r15VReg
} else {
v = xmm15VReg
}
mem := newOperandMem(m.newAmodeImmReg(uint32(arg.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
load := m.allocateInstr()
switch arg.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, mem, v)
case ssa.TypeI64:
load.asMov64MR(mem, v)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
}
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
switch arg.Type {
case ssa.TypeI32:
store.asMovRM(v, mem, 4)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeI64:
store.asMovRM(v, mem, 8)
offsetInGoSlice += 8
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, v, mem)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
offsetInGoSlice += 8
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
offsetInGoSlice += 16
default:
panic("BUG")
}
cur = linkInstr(cur, store)
}
// Finally we push the size of the slice to the stack so the stack looks like:
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | Return Addr |
// | Caller_RBP |
// +-----------------+ <----- RBP
// | arg[N]/ret[M] |
// | .......... |
// | arg[1]/ret[1] |
// | arg[0]/ret[0] |
// | slice size |
// +-----------------+ <----- RSP
// (low address)
//
// push $sliceSize
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandImm32(uint32(goSliceSizeAlignedUnaligned))))
// Load the exitCode to the register.
exitCodeReg := r12VReg // Callee saved which is already saved.
cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(exitCode), false))
saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
cur = linkInstr(cur, setExitCode)
cur = linkInstr(cur, saveRsp)
cur = linkInstr(cur, saveRbp)
// Ready to exit the execution.
cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
// We don't need the slice size anymore, so pop it.
cur = m.addRSP(8, cur)
// Ready to set up the results.
offsetInGoSlice = 0
// To avoid the result overwriting the execution context pointer, we track the offset here,
// and defer the restoration of that result to the end of this function.
var argOverlapWithExecCtxOffset int32 = -1
for i := range abi.Rets {
r := &abi.Rets[i]
var v regalloc.VReg
isRegResult := r.Kind == backend.ABIArgKindReg
if isRegResult {
v = r.Reg
if v.RealReg() == execCtrPtr.RealReg() {
argOverlapWithExecCtxOffset = offsetInGoSlice
offsetInGoSlice += 8 // always uint64 rep.
continue
}
} else {
if r.Type.IsInt() {
v = r15VReg
} else {
v = xmm15VReg
}
}
load := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetInGoSlice), rspVReg))
switch r.Type {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, mem, v)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeI64:
load.asMov64MR(mem, v)
offsetInGoSlice += 8
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, mem, v)
offsetInGoSlice += 8 // always uint64 rep.
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, mem, v)
offsetInGoSlice += 8
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
offsetInGoSlice += 16
default:
panic("BUG")
}
cur = linkInstr(cur, load)
if !isRegResult {
// We need to store it back to the result slot above rbp.
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(abi.ArgStackSize+r.Offset+16 /* to skip caller_rbp and ret_addr */), rbpVReg))
switch r.Type {
case ssa.TypeI32:
store.asMovRM(v, mem, 4)
case ssa.TypeI64:
store.asMovRM(v, mem, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, v, mem)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
default:
panic("BUG")
}
cur = linkInstr(cur, store)
}
}
// Before return, we need to restore the callee saved registers.
cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, calleeSavedVRegs)
if argOverlapWithExecCtxOffset >= 0 {
// At this point execCtrPtr is not used anymore, so we can finally store the
// result to the register which overlaps with the execution context pointer.
mem := newOperandMem(m.newAmodeImmReg(uint32(argOverlapWithExecCtxOffset), rspVReg))
load := m.allocateInstr().asMov64MR(mem, execCtrPtr)
cur = linkInstr(cur, load)
}
// Finally ready to return.
cur = m.revertRBPRSP(cur)
linkInstr(cur, m.allocateInstr().asRet())
m.encodeWithoutSSA(ectx.RootInstr)
return m.c.Buf()
}
func (m *machine) saveRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
switch v.RegType() {
case regalloc.RegTypeInt:
store.asMovRM(v, mem, 8)
case regalloc.RegTypeFloat:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
default:
panic("BUG")
}
cur = linkInstr(cur, store)
offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally.
}
return cur
}
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, execCtx regalloc.VReg, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
load := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offset), execCtx))
switch v.RegType() {
case regalloc.RegTypeInt:
load.asMov64MR(mem, v)
case regalloc.RegTypeFloat:
load.asXmmUnaryRmR(sseOpcodeMovdqu, mem, v)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
offset += 16 // See execution context struct. Each register is 16 bytes-aligned unconditionally.
}
return cur
}
func (m *machine) storeReturnAddressAndExit(cur *instruction, execCtx regalloc.VReg) *instruction {
readRip := m.allocateInstr()
cur = linkInstr(cur, readRip)
ripReg := r12VReg // Callee saved which is already saved.
saveRip := m.allocateInstr().asMovRM(
ripReg,
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), execCtx)),
8,
)
cur = linkInstr(cur, saveRip)
exit := m.allocateExitSeq(execCtx)
cur = linkInstr(cur, exit)
nop, l := m.allocateBrTarget()
cur = linkInstr(cur, nop)
readRip.asLEA(newOperandLabel(l), ripReg)
return cur
}
// stackGrowSaveVRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient
// stack space left. Basically this is all the allocatable registers except for RSP, RBP, and RAX, which contains the
// execution context pointer. The ExecCtx pointer is always the first argument so we don't need to save it.
var stackGrowSaveVRegs = []regalloc.VReg{
rdxVReg, r12VReg, r13VReg, r14VReg, r15VReg,
rcxVReg, rbxVReg, rsiVReg, rdiVReg, r8VReg, r9VReg, r10VReg, r11VReg,
xmm8VReg, xmm9VReg, xmm10VReg, xmm11VReg, xmm12VReg, xmm13VReg, xmm14VReg, xmm15VReg,
xmm0VReg, xmm1VReg, xmm2VReg, xmm3VReg, xmm4VReg, xmm5VReg, xmm6VReg, xmm7VReg,
}
// CompileStackGrowCallSequence implements backend.Machine.
func (m *machine) CompileStackGrowCallSequence() []byte {
ectx := m.ectx
cur := m.allocateNop()
ectx.RootInstr = cur
cur = m.setupRBPRSP(cur)
// Execution context is always the first argument.
execCtrPtr := raxVReg
// Save the callee saved and argument registers.
cur = m.saveRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
// Load the exitCode to the register.
exitCodeReg := r12VReg // Already saved.
cur = linkInstr(cur, m.allocateInstr().asImm(exitCodeReg, uint64(wazevoapi.ExitCodeGrowStack), false))
saveRsp, saveRbp, setExitCode := m.allocateExitInstructions(execCtrPtr, exitCodeReg)
cur = linkInstr(cur, setExitCode)
cur = linkInstr(cur, saveRsp)
cur = linkInstr(cur, saveRbp)
// Ready to exit the execution.
cur = m.storeReturnAddressAndExit(cur, execCtrPtr)
// After the exit, restore the saved registers.
cur = m.restoreRegistersInExecutionContext(cur, execCtrPtr, stackGrowSaveVRegs)
// Finally ready to return.
cur = m.revertRBPRSP(cur)
linkInstr(cur, m.allocateInstr().asRet())
m.encodeWithoutSSA(ectx.RootInstr)
return m.c.Buf()
}
// insertStackBoundsCheck will insert the instructions after `cur` to check the
// stack bounds, and if there's no sufficient spaces required for the function,
// exit the execution and try growing it in Go world.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
// add $requiredStackSize, %rsp ;; Temporarily update the sp.
// cmp ExecutionContextOffsetStackBottomPtr(%rax), %rsp ;; Compare the stack bottom and the sp.
// ja .ok
// sub $requiredStackSize, %rsp ;; Reverse the temporary update.
// pushq r15 ;; save the temporary.
// mov $requiredStackSize, %r15
// mov %r15, ExecutionContextOffsetStackGrowRequiredSize(%rax) ;; Set the required size in the execution context.
// popq r15 ;; restore the temporary.
// callq *ExecutionContextOffsetStackGrowCallTrampolineAddress(%rax) ;; Call the Go function to grow the stack.
// jmp .cont
// .ok:
// sub $requiredStackSize, %rsp ;; Reverse the temporary update.
// .cont:
cur = m.addRSP(-int32(requiredStackSize), cur)
cur = linkInstr(cur, m.allocateInstr().asCmpRmiR(true,
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackBottomPtr.U32(), raxVReg)),
rspVReg, true))
ja := m.allocateInstr()
cur = linkInstr(cur, ja)
cur = m.addRSP(int32(requiredStackSize), cur)
// Save the temporary.
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r15VReg)))
// Load the required size to the temporary.
cur = linkInstr(cur, m.allocateInstr().asImm(r15VReg, uint64(requiredStackSize), true))
// Set the required size in the execution context.
cur = linkInstr(cur, m.allocateInstr().asMovRM(r15VReg,
newOperandMem(m.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.U32(), raxVReg)), 8))
// Restore the temporary.
cur = linkInstr(cur, m.allocateInstr().asPop64(r15VReg))
// Call the Go function to grow the stack.
cur = linkInstr(cur, m.allocateInstr().asCallIndirect(newOperandMem(m.newAmodeImmReg(
wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.U32(), raxVReg)), nil))
// Jump to the continuation.
jmpToCont := m.allocateInstr()
cur = linkInstr(cur, jmpToCont)
// .ok:
okInstr, ok := m.allocateBrTarget()
cur = linkInstr(cur, okInstr)
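// condNBE corresponds to the ja .ok in the pseudo code above: the branch is taken when RSP is
// (unsigned) above the stack bottom pointer, i.e. there is enough stack space and no growth is needed.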
ja.asJmpIf(condNBE, newOperandLabel(ok))
// On the ok path, we only need to reverse the temporary update.
cur = m.addRSP(int32(requiredStackSize), cur)
// .cont:
contInstr, cont := m.allocateBrTarget()
cur = linkInstr(cur, contInstr)
jmpToCont.asJmp(newOperandLabel(cont))
return cur
}

View File

@ -0,0 +1,168 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type cond byte
const (
// condO represents (overflow) condition.
condO cond = iota
// condNO represents (no overflow) condition.
condNO
// condB represents (< unsigned) condition.
condB
// condNB represents (>= unsigned) condition.
condNB
// condZ represents (zero) condition.
condZ
// condNZ represents (not-zero) condition.
condNZ
// condBE represents (<= unsigned) condition.
condBE
// condNBE represents (> unsigned) condition.
condNBE
// condS represents (negative) condition.
condS
// condNS represents (not-negative) condition.
condNS
// condP represents (parity) condition.
condP
// condNP represents (not parity) condition.
condNP
// condL represents (< signed) condition.
condL
// condNL represents (>= signed) condition.
condNL
// condLE represents (<= signed) condition.
condLE
// condNLE represents (> signed) condition.
condNLE
condInvalid
)
func (c cond) String() string {
switch c {
case condO:
return "o"
case condNO:
return "no"
case condB:
return "b"
case condNB:
return "nb"
case condZ:
return "z"
case condNZ:
return "nz"
case condBE:
return "be"
case condNBE:
return "nbe"
case condS:
return "s"
case condNS:
return "ns"
case condL:
return "l"
case condNL:
return "nl"
case condLE:
return "le"
case condNLE:
return "nle"
case condP:
return "p"
case condNP:
return "np"
default:
panic("unreachable")
}
}
func condFromSSAIntCmpCond(origin ssa.IntegerCmpCond) cond {
switch origin {
case ssa.IntegerCmpCondEqual:
return condZ
case ssa.IntegerCmpCondNotEqual:
return condNZ
case ssa.IntegerCmpCondSignedLessThan:
return condL
case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
return condNL
case ssa.IntegerCmpCondSignedGreaterThan:
return condNLE
case ssa.IntegerCmpCondSignedLessThanOrEqual:
return condLE
case ssa.IntegerCmpCondUnsignedLessThan:
return condB
case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
return condNB
case ssa.IntegerCmpCondUnsignedGreaterThan:
return condNBE
case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
return condBE
default:
panic("unreachable")
}
}
func condFromSSAFloatCmpCond(origin ssa.FloatCmpCond) cond {
switch origin {
case ssa.FloatCmpCondGreaterThanOrEqual:
return condNB
case ssa.FloatCmpCondGreaterThan:
return condNBE
case ssa.FloatCmpCondEqual, ssa.FloatCmpCondNotEqual, ssa.FloatCmpCondLessThan, ssa.FloatCmpCondLessThanOrEqual:
panic(fmt.Sprintf("cond %s must be treated as a special case", origin))
default:
panic("unreachable")
}
}
func (c cond) encoding() byte {
return byte(c)
}
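// Note: the cond constants above follow the x86 condition-code encoding, in which a condition and
// its negation differ only in the lowest bit, so invert below is equivalent to returning c ^ 1.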
func (c cond) invert() cond {
switch c {
case condO:
return condNO
case condNO:
return condO
case condB:
return condNB
case condNB:
return condB
case condZ:
return condNZ
case condNZ:
return condZ
case condBE:
return condNBE
case condNBE:
return condBE
case condS:
return condNS
case condNS:
return condS
case condP:
return condNP
case condNP:
return condP
case condL:
return condNL
case condNL:
return condL
case condLE:
return condNLE
case condNLE:
return condLE
default:
panic("unreachable")
}
}

View File

@ -0,0 +1,35 @@
package amd64
// extMode represents the mode of extension in movzx/movsx.
type extMode byte
const (
// extModeBL represents Byte -> Longword.
extModeBL extMode = iota
// extModeBQ represents Byte -> Quadword.
extModeBQ
// extModeWL represents Word -> Longword.
extModeWL
// extModeWQ represents Word -> Quadword.
extModeWQ
// extModeLQ represents Longword -> Quadword.
extModeLQ
)
// String implements fmt.Stringer.
func (e extMode) String() string {
switch e {
case extModeBL:
return "bl"
case extModeBQ:
return "bq"
case extModeWL:
return "wl"
case extModeWQ:
return "wq"
case extModeLQ:
return "lq"
default:
panic("BUG: invalid ext mode")
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,71 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
vr = m.c.AllocateVReg(valType)
m.insertLoadConstant(instr, vr)
return
}
// InsertLoadConstantBlockArg implements backend.Machine.
func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
m.insertLoadConstant(instr, vr)
}
func (m *machine) insertLoadConstant(instr *ssa.Instruction, vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
v := instr.ConstantVal()
bits := valType.Bits()
if bits < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
v = v & ((1 << valType.Bits()) - 1)
}
switch valType {
case ssa.TypeF32, ssa.TypeF64:
m.lowerFconst(vr, v, bits == 64)
case ssa.TypeI32, ssa.TypeI64:
m.lowerIconst(vr, v, bits == 64)
default:
panic("BUG")
}
}
func (m *machine) lowerFconst(dst regalloc.VReg, c uint64, _64 bool) {
if c == 0 {
xor := m.allocateInstr().asZeros(dst)
m.insert(xor)
} else {
var tmpType ssa.Type
if _64 {
tmpType = ssa.TypeI64
} else {
tmpType = ssa.TypeI32
}
tmpInt := m.c.AllocateVReg(tmpType)
loadToGP := m.allocateInstr().asImm(tmpInt, c, _64)
m.insert(loadToGP)
movToXmm := m.allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(tmpInt), dst, _64)
m.insert(movToXmm)
}
}
func (m *machine) lowerIconst(dst regalloc.VReg, c uint64, _64 bool) {
i := m.allocateInstr()
if c == 0 {
i.asZeros(dst)
} else {
i.asImm(dst, c, _64)
}
m.insert(i)
}

View File

@ -0,0 +1,187 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
var addendsMatchOpcodes = [...]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst, ssa.OpcodeIshl}
type addend struct {
r regalloc.VReg
off int64
shift byte
}
func (a addend) String() string {
return fmt.Sprintf("addend{r=%s, off=%d, shift=%d}", a.r, a.off, a.shift)
}
// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32) (am *amode) {
def := m.c.ValueDefinition(ptr)
if offsetBase&0x80000000 != 0 {
// Special-case a huge base offset whose MSB is set. In x64, immediates are always
// sign-extended, but our IR semantics require the offset base to be unsigned.
// Note that this should be extremely rare, if it ever happens in a real application,
// so there is no need to optimize this case.
a := m.lowerAddend(def)
off64 := a.off + int64(offsetBase)
offsetBaseReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(offsetBaseReg, uint64(off64), true)
if a.r != regalloc.VRegInvalid {
return m.newAmodeRegRegShift(0, offsetBaseReg, a.r, a.shift)
} else {
return m.newAmodeImmReg(0, offsetBaseReg)
}
}
if op := m.c.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op == ssa.OpcodeIadd {
add := def.Instr
x, y := add.Arg2()
xDef, yDef := m.c.ValueDefinition(x), m.c.ValueDefinition(y)
ax := m.lowerAddend(xDef)
ay := m.lowerAddend(yDef)
add.MarkLowered()
return m.lowerAddendsToAmode(ax, ay, offsetBase)
} else {
// If it is not an Iadd, then we lower the one addend.
a := m.lowerAddend(def)
// off is always 0 if r is valid.
if a.r != regalloc.VRegInvalid {
if a.shift != 0 {
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, 0, true)
return m.newAmodeRegRegShift(offsetBase, tmpReg, a.r, a.shift)
}
return m.newAmodeImmReg(offsetBase, a.r)
} else {
off64 := a.off + int64(offsetBase)
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, uint64(off64), true)
return m.newAmodeImmReg(0, tmpReg)
}
}
}
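// The following is an illustrative sketch (not part of the wazero sources): the effective address
// that the resulting amode ultimately encodes is the usual x86 base + (index << shift) + disp32 form.
// For example, a pointer defined as Iadd(x, Ishl(y, 3)) with offsetBase 8 would lower to the
// equivalent of 8(x, y, 8) in AT&T syntax.
func effectiveAddressSketch(base, index uint64, shift byte, disp32 uint32) uint64 { // hypothetical helper, for illustration only
return base + (index << shift) + uint64(disp32)
}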
func (m *machine) lowerAddendsToAmode(x, y addend, offBase uint32) *amode {
if x.r != regalloc.VRegInvalid && x.off != 0 || y.r != regalloc.VRegInvalid && y.off != 0 {
panic("invalid input")
}
u64 := uint64(x.off+y.off) + uint64(offBase)
if u64 != 0 {
if _, ok := asImm32(u64, false); !ok {
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, u64, true)
// Blank u64 as it has already been lowered.
u64 = 0
if x.r == regalloc.VRegInvalid {
x.r = tmpReg
} else if y.r == regalloc.VRegInvalid {
y.r = tmpReg
} else {
// Both x.r and y.r are already valid, so there is nowhere to put the temporary register.
// This should not happen given the invariant checked at the top of this function.
panic("BUG")
}
}
}
u32 := uint32(u64)
switch {
// We assume x.r (resp. y.r) is valid only if x.off (resp. y.off) is 0.
case x.r != regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
switch {
case x.shift != 0 && y.shift != 0:
// Cannot absorb two shifted registers, must lower one to a shift instruction.
shifted := m.allocateInstr()
shifted.asShiftR(shiftROpShiftLeft, newOperandImm32(uint32(x.shift)), x.r, true)
m.insert(shifted)
return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
case x.shift != 0 && y.shift == 0:
// Swap base and index.
x, y = y, x
fallthrough
default:
return m.newAmodeRegRegShift(u32, x.r, y.r, y.shift)
}
case x.r == regalloc.VRegInvalid && y.r != regalloc.VRegInvalid:
x, y = y, x
fallthrough
case x.r != regalloc.VRegInvalid && y.r == regalloc.VRegInvalid:
if x.shift != 0 {
zero := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(zero, 0, true)
return m.newAmodeRegRegShift(u32, zero, x.r, x.shift)
}
return m.newAmodeImmReg(u32, x.r)
default: // Both are invalid: use the offset.
tmpReg := m.c.AllocateVReg(ssa.TypeI64)
m.lowerIconst(tmpReg, u64, true)
return m.newAmodeImmReg(0, tmpReg)
}
}
func (m *machine) lowerAddend(x *backend.SSAValueDefinition) addend {
if x.IsFromBlockParam() {
return addend{x.BlkParamVReg, 0, 0}
}
// Ensure the addend is not referenced in multiple places; we will discard nested Iadds.
op := m.c.MatchInstrOneOf(x, addendsMatchOpcodes[:])
if op != ssa.OpcodeInvalid && op != ssa.OpcodeIadd {
return m.lowerAddendFromInstr(x.Instr)
}
p := m.getOperand_Reg(x)
return addend{p.reg(), 0, 0}
}
// lowerAddendFromInstr takes an instruction and returns a VReg and an offset that can be used in an address mode.
// The Vreg is regalloc.VRegInvalid if the addend cannot be lowered to a register.
// The offset is 0 if the addend can be lowered to a register.
func (m *machine) lowerAddendFromInstr(instr *ssa.Instruction) addend {
instr.MarkLowered()
switch op := instr.Opcode(); op {
case ssa.OpcodeIconst:
u64 := instr.ConstantVal()
if instr.Return().Type().Bits() == 32 {
return addend{regalloc.VRegInvalid, int64(int32(u64)), 0} // sign-extend.
} else {
return addend{regalloc.VRegInvalid, int64(u64), 0}
}
case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
input := instr.Arg()
inputDef := m.c.ValueDefinition(input)
if input.Type().Bits() != 32 {
panic("BUG: invalid input type " + input.Type().String())
}
constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
switch {
case constInst && op == ssa.OpcodeSExtend:
return addend{regalloc.VRegInvalid, int64(uint32(inputDef.Instr.ConstantVal())), 0}
case constInst && op == ssa.OpcodeUExtend:
return addend{regalloc.VRegInvalid, int64(int32(inputDef.Instr.ConstantVal())), 0} // sign-extend!
default:
r := m.getOperand_Reg(inputDef)
return addend{r.reg(), 0, 0}
}
case ssa.OpcodeIshl:
// If the addend is a shift, we can only handle it if the shift amount is a constant.
x, amount := instr.Arg2()
amountDef := m.c.ValueDefinition(amount)
if amountDef.IsFromInstr() && amountDef.Instr.Constant() && amountDef.Instr.ConstantVal() <= 3 {
r := m.getOperand_Reg(m.c.ValueDefinition(x))
return addend{r.reg(), 0, uint8(amountDef.Instr.ConstantVal())}
}
r := m.getOperand_Reg(m.c.ValueDefinition(x))
return addend{r.reg(), 0, 0}
}
panic("BUG: invalid opcode")
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,304 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
)
// PostRegAlloc implements backend.Machine.
func (m *machine) PostRegAlloc() {
m.setupPrologue()
m.postRegAlloc()
}
func (m *machine) setupPrologue() {
cur := m.ectx.RootInstr
prevInitInst := cur.next
// At this point, we have the stack layout as follows:
//
// (high address)
// +-----------------+ <----- RBP (somewhere in the middle of the stack)
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | Return Addr |
// RSP ----> +-----------------+
// (low address)
// First, we push the RBP, and update the RBP to the current RSP.
//
// (high address) (high address)
// RBP ----> +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | ====> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | Return Addr | | Return Addr |
// RSP ----> +-----------------+ | Caller_RBP |
// (low address) +-----------------+ <----- RSP, RBP
//
cur = m.setupRBPRSP(cur)
if !m.stackBoundsCheckDisabled {
cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
}
//
// (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | | xxxxx |
// | Return Addr | | Return Addr |
// | Caller_RBP | ====> | Caller_RBP |
// RBP,RSP->+-----------------+ +-----------------+ <----- RBP
// (low address) | clobbered M |
// | clobbered 1 |
// | ........... |
// | clobbered 0 |
// +-----------------+ <----- RSP
//
if regs := m.clobberedRegs; len(regs) > 0 {
for i := range regs {
r := regs[len(regs)-1-i] // Reverse order.
if r.RegType() == regalloc.RegTypeInt {
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(r)))
} else {
// Pushing an XMM register is not supported by the PUSH instruction.
cur = m.addRSP(-16, cur)
push := m.allocateInstr().asXmmMovRM(
sseOpcodeMovdqu, r, newOperandMem(m.newAmodeImmReg(0, rspVReg)),
)
cur = linkInstr(cur, push)
}
}
}
if size := m.spillSlotSize; size > 0 {
// Simply decrease the RSP to allocate the spill slots.
// sub $size, %rsp
cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(uint32(size)), rspVReg, true))
// At this point, we have the stack layout as follows:
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <--- RBP
// | clobbered M |
// | ............ |
// | clobbered 1 |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// +-----------------+ <--- RSP
// (low address)
}
linkInstr(cur, prevInitInst)
}
// postRegAlloc does multiple things while walking through the instructions:
// 1. Inserts the epilogue code.
// 2. Removes the redundant copy instruction.
// 3. Inserts the dec/inc RSP instruction right before/after the call instruction.
// 4. Lowering that is supposed to be done after regalloc.
func (m *machine) postRegAlloc() {
ectx := m.ectx
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
switch k := cur.kind; k {
case ret:
m.setupEpilogueAfter(cur.prev)
continue
case fcvtToSintSequence, fcvtToUintSequence:
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
if k == fcvtToSintSequence {
m.lowerFcvtToSintSequenceAfterRegalloc(cur)
} else {
m.lowerFcvtToUintSequenceAfterRegalloc(cur)
}
prev := cur.prev
next := cur.next
cur := prev
for _, instr := range m.ectx.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
continue
case xmmCMov:
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
m.lowerXmmCmovAfterRegAlloc(cur)
prev := cur.prev
next := cur.next
cur := prev
for _, instr := range m.ectx.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
continue
case idivRemSequence:
m.ectx.PendingInstructions = m.ectx.PendingInstructions[:0]
m.lowerIDivRemSequenceAfterRegAlloc(cur)
prev := cur.prev
next := cur.next
cur := prev
for _, instr := range m.ectx.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
continue
case call, callIndirect:
// At this point, reg alloc is done, so we can safely insert the RSP dec/inc instructions
// right before/after the call instruction. If this were done before reg alloc, a stack slot
// could point to the wrong location and therefore result in a wrong value.
call := cur
next := call.next
_, _, _, _, size := backend.ABIInfoFromUint64(call.u2)
if size > 0 {
dec := m.allocateInstr().asAluRmiR(aluRmiROpcodeSub, newOperandImm32(size), rspVReg, true)
linkInstr(call.prev, dec)
linkInstr(dec, call)
inc := m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(size), rspVReg, true)
linkInstr(call, inc)
linkInstr(inc, next)
}
continue
}
// Removes the redundant copy instruction.
if cur.IsCopy() && cur.op1.reg().RealReg() == cur.op2.reg().RealReg() {
prev, next := cur.prev, cur.next
// Remove the copy instruction.
prev.next = next
if next != nil {
next.prev = prev
}
}
}
}
func (m *machine) setupEpilogueAfter(cur *instruction) {
prevNext := cur.next
// At this point, we have the stack layout as follows:
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <--- RBP
// | clobbered M |
// | ............ |
// | clobbered 1 |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// +-----------------+ <--- RSP
// (low address)
if size := m.spillSlotSize; size > 0 {
// Simply increase the RSP to free the spill slots.
// add $size, %rsp
cur = linkInstr(cur, m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(uint32(size)), rspVReg, true))
}
//
// (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | ReturnAddress | | ReturnAddress |
// | Caller_RBP | | Caller_RBP |
// RBP ---> +-----------------+ ========> +-----------------+ <---- RSP, RBP
// | clobbered M |
// | ............ |
// | clobbered 1 |
// | clobbered 0 |
// RSP ---> +-----------------+
// (low address)
//
if regs := m.clobberedRegs; len(regs) > 0 {
for _, r := range regs {
if r.RegType() == regalloc.RegTypeInt {
cur = linkInstr(cur, m.allocateInstr().asPop64(r))
} else {
// Popping an XMM register is not supported by the POP instruction.
pop := m.allocateInstr().asXmmUnaryRmR(
sseOpcodeMovdqu, newOperandMem(m.newAmodeImmReg(0, rspVReg)), r,
)
cur = linkInstr(cur, pop)
cur = m.addRSP(16, cur)
}
}
}
// Now roll back the RSP to RBP, and pop the caller's RBP.
cur = m.revertRBPRSP(cur)
linkInstr(cur, prevNext)
}
func (m *machine) addRSP(offset int32, cur *instruction) *instruction {
if offset == 0 {
return cur
}
opcode := aluRmiROpcodeAdd
if offset < 0 {
opcode = aluRmiROpcodeSub
offset = -offset
}
return linkInstr(cur, m.allocateInstr().asAluRmiR(opcode, newOperandImm32(uint32(offset)), rspVReg, true))
}
func (m *machine) setupRBPRSP(cur *instruction) *instruction {
cur = linkInstr(cur, m.allocateInstr().asPush64(newOperandReg(rbpVReg)))
cur = linkInstr(cur, m.allocateInstr().asMovRR(rspVReg, rbpVReg, true))
return cur
}
func (m *machine) revertRBPRSP(cur *instruction) *instruction {
cur = linkInstr(cur, m.allocateInstr().asMovRR(rbpVReg, rspVReg, true))
cur = linkInstr(cur, m.allocateInstr().asPop64(rbpVReg))
return cur
}

View File

@ -0,0 +1,153 @@
package amd64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// InsertMoveBefore implements backend.RegAllocFunctionMachine.
func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
typ := src.RegType()
if typ != dst.RegType() {
panic("BUG: src and dst must have the same type")
}
mov := m.allocateInstr()
if typ == regalloc.RegTypeInt {
mov.asMovRR(src, dst, true)
} else {
mov.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(src), dst)
}
cur := instr.prev
prevNext := cur.next
cur = linkInstr(cur, mov)
linkInstr(cur, prevNext)
}
// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.c.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
store := m.allocateInstr()
mem := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
switch typ {
case ssa.TypeI32:
store.asMovRM(v, mem, 4)
case ssa.TypeI64:
store.asMovRM(v, mem, 8)
case ssa.TypeF32:
store.asXmmMovRM(sseOpcodeMovss, v, mem)
case ssa.TypeF64:
store.asXmmMovRM(sseOpcodeMovsd, v, mem)
case ssa.TypeV128:
store.asXmmMovRM(sseOpcodeMovdqu, v, mem)
}
cur = linkInstr(cur, store)
return linkInstr(cur, prevNext)
}
// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.c.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
// Load the value to the temporary.
load := m.allocateInstr()
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
a := newOperandMem(m.newAmodeImmReg(uint32(offsetFromSP), rspVReg))
switch typ {
case ssa.TypeI32:
load.asMovzxRmR(extModeLQ, a, v)
case ssa.TypeI64:
load.asMov64MR(a, v)
case ssa.TypeF32:
load.asXmmUnaryRmR(sseOpcodeMovss, a, v)
case ssa.TypeF64:
load.asXmmUnaryRmR(sseOpcodeMovsd, a, v)
case ssa.TypeV128:
load.asXmmUnaryRmR(sseOpcodeMovdqu, a, v)
default:
panic("BUG")
}
cur = linkInstr(cur, load)
return linkInstr(cur, prevNext)
}
// ClobberedRegisters implements backend.RegAllocFunctionMachine.
func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
}
// Swap implements backend.RegAllocFunctionMachine.
func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
if x1.RegType() == regalloc.RegTypeInt {
prevNext := cur.next
xc := m.allocateInstr().asXCHG(x1, newOperandReg(x2), 8)
cur = linkInstr(cur, xc)
linkInstr(cur, prevNext)
} else {
if tmp.Valid() {
prevNext := cur.next
m.InsertMoveBefore(tmp, x1, prevNext)
m.InsertMoveBefore(x1, x2, prevNext)
m.InsertMoveBefore(x2, tmp, prevNext)
} else {
prevNext := cur.next
r2 := x2.RealReg()
// Temporarily spill x1 to stack.
cur = m.InsertStoreRegisterAt(x1, cur, true).prev
// Then move x2 to x1.
cur = linkInstr(cur, m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqa, newOperandReg(x2), x1))
linkInstr(cur, prevNext)
// Then reload the original value on x1 from stack to r2.
m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
}
}
}
// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
cur := end
for cur.kind == nop0 {
cur = cur.prev
if cur == begin {
return end
}
}
switch cur.kind {
case jmp:
return cur
default:
return end
}
}
// SSABlockLabel implements backend.RegAllocFunctionMachine.
func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
return m.ectx.SsaBlockIDToLabels[id]
}

View File

@ -0,0 +1,992 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
var swizzleMask = [16]byte{
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
}
func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) {
masklabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:])
// Load mask to maskReg.
maskReg := m.c.AllocateVReg(ssa.TypeV128)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(masklabel)), maskReg)
m.insert(loadMask)
// Copy x and y to tmp registers.
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
tmpDst := m.copyToTmp(xx.reg())
yy := m.getOperand_Reg(m.c.ValueDefinition(y))
tmpX := m.copyToTmp(yy.reg())
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpX))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpX), tmpDst))
// Copy the result to the destination register.
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) {
// Copy x to tmp.
tmpDst := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst))
yy := m.getOperand_Reg(m.c.ValueDefinition(y))
switch lane {
case ssa.VecLaneI8x16:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst))
case ssa.VecLaneI16x8:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst))
case ssa.VecLaneI32x4:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst))
case ssa.VecLaneI64x2:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst))
case ssa.VecLaneF32x4:
// In the INSERTPS instruction, the destination index is encoded in bits 4 and 5 of the immediate.
// See https://www.felixcloutier.com/x86/insertps
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst))
case ssa.VecLaneF64x2:
if index == 0 {
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst))
} else {
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) {
// Pextr variants are used to extract a lane from a vector register.
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
tmpDst := m.c.AllocateVReg(ret.Type())
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
switch lane {
case ssa.VecLaneI8x16:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst))
if signed {
m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
} else {
m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
}
case ssa.VecLaneI16x8:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst))
if signed {
m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
} else {
m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
}
case ssa.VecLaneI32x4:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst))
case ssa.VecLaneI64x2:
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst))
case ssa.VecLaneF32x4:
if index == 0 {
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst))
} else {
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst))
}
case ssa.VecLaneF64x2:
if index == 0 {
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
} else {
m.copyTo(xx.reg(), tmpDst)
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
var sqmulRoundSat = [16]byte{
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
}
func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) {
// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:])
tmp := m.c.AllocateVReg(ssa.TypeV128)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp)
m.insert(loadMask)
xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
tmpX := m.copyToTmp(xx.reg())
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmpX), tmp))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX))
m.copyTo(tmpX, m.c.VRegOf(ret))
}
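// Illustrative sketch (not part of the wazero sources): the scalar semantics of i16x8.q15mulr_sat_s
// that the sequence above implements, i.e. a rounding Q15 multiply with signed saturation. PMULHRSW
// alone yields -32768 for the single overflowing input -32768 * -32768, and the mask compare/xor
// above flips that lane to the saturated 32767 (see the linked SIMD proposal discussion).
func q15MulRSatSketch(x, y int16) int16 { // hypothetical helper, for illustration only
v := (int32(x)*int32(y) + (1 << 14)) >> 15 // Q15 multiply with rounding.
if v > 32767 { // only -32768 * -32768 exceeds the int16 range.
v = 32767
}
return int16(v)
}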
func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) {
switch lane {
case ssa.VecLaneI8x16:
m.lowerVUshri8x16(x, y, ret)
case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2:
m.lowerShr(x, y, ret, lane, false)
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
}
// i8x16LogicalSHRMaskTable is necessary for emulating non-existent packed bytes logical right shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
}
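// Illustrative sketch (not part of the wazero sources): each 16-byte row of the table above is just
// 0xff logically shifted right by the row's shift amount, broadcast to all 16 bytes. PSRLW shifts
// 16-bit words, so bits of the neighboring byte leak into each byte's high bits, and AND-ing with
// the selected row clears exactly those leaked bits.
func i8x16LogicalSHRMaskRowSketch(shift uint8) (row [16]byte) { // hypothetical helper, for illustration only
b := byte(0xff) >> shift // e.g. shift=3 -> 0x1f, matching the "for 3 shift" row above.
for i := range row {
row[i] = b
}
return
}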
func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) {
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the modulo 8 mask to tmpReg.
m.lowerIconst(tmpGpReg, 0x7, false)
// Take the modulo 8 of the shift amount.
shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false))
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
vecTmp := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx))
maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:])
base := m.c.AllocateVReg(ssa.TypeI64)
lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
m.insert(lea)
// Shift tmpGpReg by 4 to multiply the shift amount by 16.
m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp)
m.insert(loadMask)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) {
switch lane {
case ssa.VecLaneI8x16:
m.lowerVSshri8x16(x, y, ret)
case ssa.VecLaneI16x8, ssa.VecLaneI32x4:
m.lowerShr(x, y, ret, lane, true)
case ssa.VecLaneI64x2:
m.lowerVSshri64x2(x, y, ret)
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
}
func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) {
shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the modulo 8 mask to tmpReg.
m.lowerIconst(shiftAmtReg, 0x7, false)
// Take the modulo 8 of the shift amount.
shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false))
// Copy the x value to two temporary registers.
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
vecTmp := m.c.AllocateVReg(ssa.TypeV128)
m.copyTo(xx, vecTmp)
// Assuming that we have
// xx = [b1, ..., b16]
// vecTmp = [b1, ..., b16]
// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
// xx = [b1, b1, b2, b2, ..., b8, b8]
// vecTmp = [b9, b9, b10, b10, ..., b16, b16]
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp))
// Add 8 to the shift amount, and then move it to vecTmp2.
vecTmp2 := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false))
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false))
// Perform the word packed arithmetic right shifts on vreg and vecTmp.
// This changes these two registers as:
// xx = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
// vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
// where xxx is 1 or 0 depending on each byte's sign, and ">>" is the arithmetic shift on a byte.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp))
// Finally, we can get the result by packing these two word vectors.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) {
// Load the shift amount to RCX.
shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg))
tmpGp := m.c.AllocateVReg(ssa.TypeI64)
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xxReg := m.copyToTmp(_xx.reg())
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xxReg), tmpGp))
m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xxReg))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xxReg), tmpGp))
m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xxReg))
m.copyTo(xxReg, m.c.VRegOf(ret))
}
func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
var modulo uint64
var shiftOp sseOpcode
switch lane {
case ssa.VecLaneI16x8:
modulo = 0xf
if signed {
shiftOp = sseOpcodePsraw
} else {
shiftOp = sseOpcodePsrlw
}
case ssa.VecLaneI32x4:
modulo = 0x1f
if signed {
shiftOp = sseOpcodePsrad
} else {
shiftOp = sseOpcodePsrld
}
case ssa.VecLaneI64x2:
modulo = 0x3f
if signed {
panic("BUG")
}
shiftOp = sseOpcodePsrlq
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the lane-size modulo mask into tmpGpReg.
m.lowerIconst(tmpGpReg, modulo, false)
// Mask the shift amount down to the lane bit width.
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
// And move it to a xmm register.
tmpVec := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
// Then do the actual shift.
m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) {
var modulo uint64
var shiftOp sseOpcode
var isI8x16 bool
switch lane {
case ssa.VecLaneI8x16:
isI8x16 = true
modulo = 0x7
shiftOp = sseOpcodePsllw
case ssa.VecLaneI16x8:
modulo = 0xf
shiftOp = sseOpcodePsllw
case ssa.VecLaneI32x4:
modulo = 0x1f
shiftOp = sseOpcodePslld
case ssa.VecLaneI64x2:
modulo = 0x3f
shiftOp = sseOpcodePsllq
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
// Load the lane-size modulo mask into tmpGpReg.
m.lowerIconst(tmpGpReg, modulo, false)
// Mask the shift amount down to the lane bit width.
m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
// And move it to a xmm register.
tmpVec := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))
// Then do the actual shift.
m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))
if isI8x16 {
maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:])
base := m.c.AllocateVReg(ssa.TypeI64)
lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
m.insert(lea)
// Shift tmpGpReg by 4 to multiply the shift amount by 16.
m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))
mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec)
m.insert(loadMask)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx))
}
m.copyTo(xx, m.c.VRegOf(ret))
}
// i8x16SHLMaskTable is necessary for emulating non-existent packed bytes left shifts on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16SHLMaskTable = [8 * 16]byte{ // (the number of possible shift amount 0, 1, ..., 7.) * 16 bytes.
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
}
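// Illustrative sketch (not part of the wazero sources), mirroring the logical-shift-right table above:
// each row here is 0xff shifted left by the row's shift amount, broadcast to 16 bytes, clearing the
// bits that PSLLW lets spill in from the neighboring lower byte.
func i8x16SHLMaskRowSketch(shift uint8) (row [16]byte) { // hypothetical helper, for illustration only
b := byte(0xff) << shift // e.g. shift=5 -> 0xe0, matching the "for 5 shift" row above.
for i := range row {
row[i] = b
}
return
}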
func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) {
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
var round sseOpcode
if _64 {
round = sseOpcodeRoundpd
} else {
round = sseOpcodeRoundps
}
m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret)))
}
var (
allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80}
extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00}
)
func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
switch srcLane {
case ssa.VecLaneI8x16:
allOneReg := m.c.AllocateVReg(ssa.TypeV128)
mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg))
var resultReg regalloc.VReg
if signed {
resultReg = allOneReg
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg))
} else {
// Interpret tmp (all ones) as the signed-byte operand so that the multiply-add effectively treats xx as unsigned.
resultReg = xx
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg))
}
m.copyTo(resultReg, m.c.VRegOf(ret))
case ssa.VecLaneI16x8:
if signed {
allOnesReg := m.c.AllocateVReg(ssa.TypeV128)
mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx))
m.copyTo(xx, m.c.VRegOf(ret))
} else {
maskReg := m.c.AllocateVReg(ssa.TypeV128)
mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
// Flip the sign bits on xx.
//
// Assuming that xx = [w1, ..., w8], now we have,
// xx[i] = int8(-w1) for i = 0...8
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx))
mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
// For i = 0,..4 (as this results in i32x4 lanes), now we have
// xx[i] = int32(-wn + -w(n+1)) = int32(-(wn + w(n+1)))
// c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx))
mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))
// vr[i] = int32(-(wn + w(n+1))) + int32(math.MaxInt16+1) = int32((wn + w(n+1))) = uint32(wn + w(n+1)).
// c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr)
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", srcLane))
}
}
func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
var sseOp sseOpcode
switch lane {
case ssa.VecLaneI8x16:
if signed {
sseOp = sseOpcodePmovsxbw
} else {
sseOp = sseOpcodePmovzxbw
}
case ssa.VecLaneI16x8:
if signed {
sseOp = sseOpcodePmovsxwd
} else {
sseOp = sseOpcodePmovzxwd
}
case ssa.VecLaneI32x4:
if signed {
sseOp = sseOpcodePmovsxdq
} else {
sseOp = sseOpcodePmovzxdq
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret)))
}
func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
tmp := m.c.AllocateVReg(ssa.TypeV128)
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
m.copyTo(xx.reg(), tmp)
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp))
var sseOp sseOpcode
switch lane {
case ssa.VecLaneI8x16:
if signed {
sseOp = sseOpcodePmovsxbw
} else {
sseOp = sseOpcodePmovzxbw
}
case ssa.VecLaneI16x8:
if signed {
sseOp = sseOpcodePmovsxwd
} else {
sseOp = sseOpcodePmovzxwd
}
case ssa.VecLaneI32x4:
if signed {
sseOp = sseOpcodePmovsxdq
} else {
sseOp = sseOpcodePmovzxdq
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret)))
}
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) {
tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64)
am := newOperandMem(m.lowerToAddressMode(ptr, offset))
m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
switch lane {
case ssa.VecLaneI8x16:
m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst))
tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asZeros(tmpZeroVec))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst))
case ssa.VecLaneI16x8:
m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
case ssa.VecLaneI32x4:
m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
case ssa.VecLaneI64x2:
m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(tmpDst, m.c.VRegOf(ret))
}
var f64x2CvtFromIMask = [16]byte{
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}
func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
switch lane {
case ssa.VecLaneF32x4:
if signed {
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
} else {
xx := m.getOperand_Reg(m.c.ValueDefinition(x))
// Copy the value to two temporary registers.
tmp := m.copyToTmp(xx.reg())
tmp2 := m.copyToTmp(xx.reg())
// Clear the higher 16 bits of each 32-bit element.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))
// Subtract the higher 16-bits from tmp2: clear the lower 16-bits of tmp2.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))
// Convert the lower 16-bits in tmp.
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
// Halve tmp2 by shifting it right by one, then convert it, so tmp2 holds half of the conversion of the higher 16 bits.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))
// Double the converted, halved higher 16 bits.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))
// Get the conversion result by adding tmp (holding the lower 16-bit conversion) into tmp2.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))
m.copyTo(tmp2, m.c.VRegOf(ret))
}
case ssa.VecLaneF64x2:
if signed {
xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
} else {
maskReg := m.c.AllocateVReg(ssa.TypeV128)
maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
// maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
// Given that we have xx = [d1, d2, d3, d4], this results in
// xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
// = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
// ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))
// maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))
// Now, we get the result as
// xx = [float64(uint32(d1)), float64(uint32(d2))]
// because the following equality always satisfies:
// float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52 + float64(uint32(y))) = float64(uint32(x)) - float64(uint32(y))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
}
var (
// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
i32sMaxOnF64x2 = [16]byte{
0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
}
// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
i32uMaxOnF64x2 = [16]byte{
0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
}
// twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that,
// with this exponent, the low bits of the mantissa represent a corresponding uint32 number, and after
// arithmetic like addition or subtraction, the resulting floating point still holds exactly that
// 32-bit integer bit pattern in its mantissa.
//
// Note: the name twop52 is common across various compiler ecosystems.
// E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
// E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
twop52 = [16]byte{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
}
)
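// Illustrative sketch (not part of the wazero sources): a quick check of the 0x1.0p52 trick described
// above. Adding any uint32 value to 0x1.0p52 is exact (the float64 spacing around 2^52 is 1.0), so the
// low 32 bits of the sum's mantissa are exactly that uint32, which is what the SUBPD sequence above and
// the ADDPD/SHUFPS sequence below rely on.
func twop52RoundTripsSketch(u uint32) bool { // hypothetical helper, for illustration only
f := 0x1.0p52 + float64(u) // exact for any uint32 value.
return uint32(f-0x1.0p52) == u
}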
func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
switch lane {
case ssa.VecLaneF32x4:
if signed {
tmp := m.copyToTmp(xx)
// Assuming we have xx = [v1, v2, v3, v4].
//
// Set all bits if lane is not NaN on tmp.
// tmp[i] = 0xffffffff if vi != NaN
// = 0 if vi == NaN
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
// Clear NaN lanes on xx, meaning that
// xx[i] = vi if vi != NaN
// 0 if vi == NaN
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx))
// tmp[i] = ^vi if vi != NaN
// = 0xffffffff if vi == NaN
// which means that tmp[i] & 0x80000000 != 0 if and only if vi is negative.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp))
// xx[i] = int32(vi) if vi != NaN and xx is not overflowing.
// = 0x80000000 if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
// = 0 if vi == NaN
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lane.
//
// tmp[i] = 0x80000000 if vi is positive
// = any satisfying any&0x80000000 = 0 if vi is negative or zero.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp))
// Arithmetic right shifting tmp by 31, meaning that we have
// tmp[i] = 0xffffffff if vi is positive, 0 otherwise.
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp))
// Flipping 0x80000000 if vi is positive, otherwise keep intact.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx))
} else {
tmp := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asZeros(tmp))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp))
m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
tmp2 := m.copyToTmp(xx)
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2))
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx))
}
case ssa.VecLaneF64x2:
tmp2 := m.c.AllocateVReg(ssa.TypeV128)
if signed {
tmp := m.copyToTmp(xx)
// Set all bits for non-NaN lanes, zeros otherwise.
// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(tmp), tmp))
maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:])
// Load float64(2147483647) into each lane of tmp2.
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))
// tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp))
// MINPD returns the second (source) operand when either input is NaN, so we have
// xx[i] = min(vi, 2147483647.0) if vi != NaN
//       = 0                     if vi == NaN
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx))
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx))
} else {
tmp := m.c.AllocateVReg(ssa.TypeV128)
m.insert(m.allocateInstr().asZeros(tmp))
// xx[i] = vi if vi != NaN && vi > 0
// = 0 if vi == NaN || vi <= 0
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx))
// tmp2[i] = float64(math.MaxUint32), which is exactly representable in float64.
maskIndex := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
// xx[i] = vi if vi != NaN && vi > 0 && vi <= math.MaxUint32
// = 0 otherwise
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx))
// Truncate the floating point values toward zero (ROUNDPD with rounding mode 0b11).
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx))
// tmp2[i] = float64(0x1.0p52)
maskIndex = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskIndex)), tmp2))
// xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32
// = 0 otherwise
//
// This means that xx[i] holds exactly the same bit of uint32(vi) in its lower 32-bits.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx))
// At this point, we have
// xx = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)]
// tmp = [0, 0, 0, 0]
// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
// xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0]
// meaning that for i = 0 and 1, we have
// xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32
// = 0 otherwise.
m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx))
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.copyTo(xx, m.c.VRegOf(ret))
}
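The unsigned F64x2 path above leans on the classic 2^52 trick: once a non-negative float64 smaller than 2^32 is truncated and added to 0x1.0p52, its integer value sits verbatim in the low 32 bits of the sum's bit pattern, which is what the final SHUFPS extracts. A standalone sketch of that identity in plain Go (illustrative only, not part of this file):

package main

import (
	"fmt"
	"math"
)

func main() {
	const twop52 = 0x1p52 // same constant as the twop52 table above.
	for _, v := range []float64{0, 1, 255.7, 4294967295} {
		t := math.Trunc(v)                    // ROUNDPD with 0b11 truncates toward zero.
		sum := t + twop52                     // ADDPD: the integer value lands in the mantissa.
		lo32 := uint32(math.Float64bits(sum)) // SHUFPS keeps only the low 32 bits of each lane.
		fmt.Println(uint32(t) == lo32)        // true for every v in [0, 2^32).
	}
}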
func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
var sseOp sseOpcode
switch lane {
case ssa.VecLaneI16x8:
if signed {
sseOp = sseOpcodePacksswb
} else {
sseOp = sseOpcodePackuswb
}
case ssa.VecLaneI32x4:
if signed {
sseOp = sseOpcodePackssdw
} else {
sseOp = sseOpcodePackusdw
}
default:
panic(fmt.Sprintf("invalid lane type: %s", lane))
}
m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
xx := m.copyToTmp(_xx.reg())
yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx))
m.copyTo(xx, m.c.VRegOf(ret))
}
func (m *machine) lowerVIabs(instr *ssa.Instruction) {
x, lane := instr.ArgWithLane()
rd := m.c.VRegOf(instr.Return())
if lane == ssa.VecLaneI64x2 {
_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
blendReg := xmm0VReg
m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg))
tmp := m.copyToTmp(_xx.reg())
xx := m.copyToTmp(_xx.reg())
// Clear all bits on blendReg.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg))
// Subtract xx from blendReg, i.e. compute -xx.
m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg))
// Copy the negated value ^^ back into xx.
m.copyTo(blendReg, xx)
m.insert(m.allocateInstr().asBlendvpd(newOperandReg(tmp), xx))
m.copyTo(xx, rd)
} else {
var vecOp sseOpcode
switch lane {
case ssa.VecLaneI8x16:
vecOp = sseOpcodePabsb
case ssa.VecLaneI16x8:
vecOp = sseOpcodePabsw
case ssa.VecLaneI32x4:
vecOp = sseOpcodePabsd
}
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
i := m.allocateInstr()
i.asXmmUnaryRmR(vecOp, rn, rd)
m.insert(i)
}
}
func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) {
x := instr.Arg()
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
rd := m.c.VRegOf(instr.Return())
tmp1 := m.c.AllocateVReg(ssa.TypeV128)
m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f)
// Copy input into tmp2.
tmp2 := m.copyToTmp(rn.reg())
// Given that we have:
// rn = [b1, ..., b16] where bn = hn:ln and hn and ln are the higher and lower 4-bits of bn.
//
// Take PAND on tmp1 and tmp2, so that we mask out all the higher bits.
// tmp2 = [l1, ..., l16].
pand := m.allocateInstr()
pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2)
m.insert(pand)
// Do logical (packed word) right shift by 4 on another copy of rn (tmp3) and PAND against the mask (tmp1); meaning that we have
// tmp3 = [h1, ...., h16].
tmp3 := m.copyToTmp(rn.reg())
psrlw := m.allocateInstr()
psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3)
m.insert(psrlw)
pand2 := m.allocateInstr()
pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3)
m.insert(pand2)
// Read the popcntTable into tmp4, and we have
// tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
tmp4 := m.c.AllocateVReg(ssa.TypeV128)
m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01)
// Make a copy for later.
tmp5 := m.copyToTmp(tmp4)
// tmp4 = [popcnt(l1), ..., popcnt(l16)].
pshufb := m.allocateInstr()
pshufb.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4)
m.insert(pshufb)
pshufb2 := m.allocateInstr()
pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5)
m.insert(pshufb2)
// tmp4 + tmp5 is the result.
paddb := m.allocateInstr()
paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp4), tmp5)
m.insert(paddb)
m.copyTo(tmp5, rd)
}
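The PAND/PSRLW/PSHUFB/PADDB sequence above is a SIMD 4-bit table lookup. The same idea in scalar form, as an illustrative sketch that is not part of this file:

package main

import "fmt"

// popcntTable[i] is the number of set bits in the 4-bit value i; it is the same
// 16-byte table that lowerVIpopcnt loads into tmp4 via lowerVconst above.
var popcntTable = [16]byte{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}

func popcnt8(b byte) byte {
	lo := b & 0x0f                           // PAND with the 0x0f mask.
	hi := (b >> 4) & 0x0f                    // PSRLW by 4, then PAND.
	return popcntTable[lo] + popcntTable[hi] // two PSHUFB lookups, then PADDB.
}

func main() {
	fmt.Println(popcnt8(0b1011_0110)) // 5
}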
func (m *machine) lowerVImul(instr *ssa.Instruction) {
x, y, lane := instr.Arg2WithLane()
rd := m.c.VRegOf(instr.Return())
if lane == ssa.VecLaneI64x2 {
rn := m.getOperand_Reg(m.c.ValueDefinition(x))
rm := m.getOperand_Reg(m.c.ValueDefinition(y))
// Assuming that we have
// rm = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_hi]
// rn = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_hi]
// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lanes.
// Copy rn into tmp1.
tmp1 := m.copyToTmp(rn.reg())
// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [q1_hi, 0, q2_hi, 0]
shift := m.allocateInstr()
shift.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1)
m.insert(shift)
// Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_high*q1_lo, p2_high*q2_lo] where each lane is 64-bit.
mul := m.allocateInstr()
mul.asXmmRmR(sseOpcodePmuludq, rm, tmp1)
m.insert(mul)
// Copy rm value into tmp2.
tmp2 := m.copyToTmp(rm.reg())
// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [p1_hi, 0, p2_hi, 0]
shift2 := m.allocateInstr()
shift2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2)
m.insert(shift2)
// Execute "pmuludq rm,tmp2", which makes tmp2 = [p1_lo*q1_high, p2_lo*q2_high] where each lane is 64-bit.
mul2 := m.allocateInstr()
mul2.asXmmRmR(sseOpcodePmuludq, rn, tmp2)
m.insert(mul2)
// Add tmp1 and tmp2 and do the logical left shift by 32-bit,
// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32, (p2_lo*q2_hi+p2_hi*q2_lo)<<32]
add := m.allocateInstr()
add.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1)
m.insert(add)
shift3 := m.allocateInstr()
shift3.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1)
m.insert(shift3)
// Copy rm value into tmp3.
tmp3 := m.copyToTmp(rm.reg())
// "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
mul3 := m.allocateInstr()
mul3.asXmmRmR(sseOpcodePmuludq, rn, tmp3)
m.insert(mul3)
// Finally, we get the result by computing tmp1 + tmp3,
// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_hi+p2_hi*q2_lo)<<32+p2_lo*q2_lo]
add2 := m.allocateInstr()
add2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1)
m.insert(add2)
m.copyTo(tmp1, rd)
} else {
var vecOp sseOpcode
switch lane {
case ssa.VecLaneI16x8:
vecOp = sseOpcodePmullw
case ssa.VecLaneI32x4:
vecOp = sseOpcodePmulld
default:
panic("unsupported: " + lane.String())
}
m.lowerVbBinOp(vecOp, x, y, instr.Return())
}
}
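The I64x2 path above is the usual 32-bit partial-product decomposition of a 64-bit multiply; the hi*hi term can be dropped because only the low 64 bits are kept. A scalar sketch of the same identity, purely illustrative and not part of this file:

package main

import "fmt"

// mul64 mirrors the PMULUDQ/PSRLQ/PADDQ/PSLLQ sequence above for a single lane.
func mul64(p, q uint64) uint64 {
	pLo, pHi := p&0xffffffff, p>>32
	qLo, qHi := q&0xffffffff, q>>32
	cross := (pLo*qHi + pHi*qLo) << 32 // the two cross products, summed and shifted.
	return cross + pLo*qLo             // plus the low product; pHi*qHi*2^64 wraps away.
}

func main() {
	p, q := uint64(0x1234567890abcdef), uint64(0xfedcba0987654321)
	fmt.Println(mul64(p, q) == p*q) // true
}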

View File

@ -0,0 +1,346 @@
package amd64
import (
"fmt"
"unsafe"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type operand struct {
kind operandKind
data uint64
}
type operandKind byte
const (
// operandKindReg is an operand which is an integer Register.
operandKindReg operandKind = iota + 1
// operandKindMem is a value in Memory.
// 32, 64, or 128 bit value.
operandKindMem
// operandKindImm32 is a signed-32-bit integer immediate value.
operandKindImm32
// operandKindLabel is a label.
operandKindLabel
)
// String implements fmt.Stringer.
func (o operandKind) String() string {
switch o {
case operandKindReg:
return "reg"
case operandKindMem:
return "mem"
case operandKindImm32:
return "imm32"
case operandKindLabel:
return "label"
default:
panic("BUG: invalid operand kind")
}
}
// format returns the string representation of the operand.
// _64 is only relevant when the operand is an integer register; it selects the 64-bit or 32-bit name.
func (o *operand) format(_64 bool) string {
switch o.kind {
case operandKindReg:
return formatVRegSized(o.reg(), _64)
case operandKindMem:
return o.addressMode().String()
case operandKindImm32:
return fmt.Sprintf("$%d", int32(o.imm32()))
case operandKindLabel:
return backend.Label(o.imm32()).String()
default:
panic(fmt.Sprintf("BUG: invalid operand: %s", o.kind))
}
}
//go:inline
func (o *operand) reg() regalloc.VReg {
return regalloc.VReg(o.data)
}
//go:inline
func (o *operand) setReg(r regalloc.VReg) {
o.data = uint64(r)
}
//go:inline
func (o *operand) addressMode() *amode {
return wazevoapi.PtrFromUintptr[amode](uintptr(o.data))
}
//go:inline
func (o *operand) imm32() uint32 {
return uint32(o.data)
}
func (o *operand) label() backend.Label {
switch o.kind {
case operandKindLabel:
return backend.Label(o.data)
case operandKindMem:
mem := o.addressMode()
if mem.kind() != amodeRipRel {
panic("BUG: invalid label")
}
return backend.Label(mem.imm32)
default:
panic("BUG: invalid operand kind")
}
}
func newOperandLabel(label backend.Label) operand {
return operand{kind: operandKindLabel, data: uint64(label)}
}
func newOperandReg(r regalloc.VReg) operand {
return operand{kind: operandKindReg, data: uint64(r)}
}
func newOperandImm32(imm32 uint32) operand {
return operand{kind: operandKindImm32, data: uint64(imm32)}
}
func newOperandMem(amode *amode) operand {
return operand{kind: operandKindMem, data: uint64(uintptr(unsafe.Pointer(amode)))}
}
// amode is a memory operand (addressing mode).
type amode struct {
kindWithShift uint32
imm32 uint32
base regalloc.VReg
// For amodeRegRegShift:
index regalloc.VReg
}
type amodeKind byte
const (
// amodeImmReg calculates sign-extend-32-to-64(Immediate) + base
amodeImmReg amodeKind = iota + 1
// amodeImmRBP is the same as amodeImmReg, but the base register is fixed to RBP.
// The only difference is that it doesn't tell the register allocator that RBP is used, which would be
// distracting for the register allocator.
amodeImmRBP
// amodeRegRegShift calculates sign-extend-32-to-64(Immediate) + base + (Register2 << Shift)
amodeRegRegShift
// amodeRipRel is a RIP-relative addressing mode specified by the label.
amodeRipRel
// TODO: there are other addressing modes such as the one without base register.
)
func (a *amode) kind() amodeKind {
return amodeKind(a.kindWithShift & 0xff)
}
func (a *amode) shift() byte {
return byte(a.kindWithShift >> 8)
}
func (a *amode) uses(rs *[]regalloc.VReg) {
switch a.kind() {
case amodeImmReg:
*rs = append(*rs, a.base)
case amodeRegRegShift:
*rs = append(*rs, a.base, a.index)
case amodeImmRBP, amodeRipRel:
default:
panic("BUG: invalid amode kind")
}
}
func (a *amode) nregs() int {
switch a.kind() {
case amodeImmReg:
return 1
case amodeRegRegShift:
return 2
case amodeImmRBP, amodeRipRel:
return 0
default:
panic("BUG: invalid amode kind")
}
}
func (a *amode) assignUses(i int, reg regalloc.VReg) {
switch a.kind() {
case amodeImmReg:
if i == 0 {
a.base = reg
} else {
panic("BUG: invalid amode assignment")
}
case amodeRegRegShift:
if i == 0 {
a.base = reg
} else if i == 1 {
a.index = reg
} else {
panic("BUG: invalid amode assignment")
}
default:
panic("BUG: invalid amode assignment")
}
}
func (m *machine) newAmodeImmReg(imm32 uint32, base regalloc.VReg) *amode {
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeImmReg), imm32: imm32, base: base}
return ret
}
func (m *machine) newAmodeImmRBPReg(imm32 uint32) *amode {
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeImmRBP), imm32: imm32, base: rbpVReg}
return ret
}
func (m *machine) newAmodeRegRegShift(imm32 uint32, base, index regalloc.VReg, shift byte) *amode {
if shift > 3 {
panic(fmt.Sprintf("BUG: invalid shift (must be 3>=): %d", shift))
}
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeRegRegShift) | uint32(shift)<<8, imm32: imm32, base: base, index: index}
return ret
}
func (m *machine) newAmodeRipRel(label backend.Label) *amode {
ret := m.amodePool.Allocate()
*ret = amode{kindWithShift: uint32(amodeRipRel), imm32: uint32(label)}
return ret
}
// String implements fmt.Stringer.
func (a *amode) String() string {
switch a.kind() {
case amodeImmReg, amodeImmRBP:
if a.imm32 == 0 {
return fmt.Sprintf("(%s)", formatVRegSized(a.base, true))
}
return fmt.Sprintf("%d(%s)", int32(a.imm32), formatVRegSized(a.base, true))
case amodeRegRegShift:
shift := 1 << a.shift()
if a.imm32 == 0 {
return fmt.Sprintf(
"(%s,%s,%d)",
formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
}
return fmt.Sprintf(
"%d(%s,%s,%d)",
int32(a.imm32), formatVRegSized(a.base, true), formatVRegSized(a.index, true), shift)
case amodeRipRel:
return fmt.Sprintf("%s(%%rip)", backend.Label(a.imm32))
default:
panic("BUG: invalid amode kind")
}
}
func (m *machine) getOperand_Mem_Reg(def *backend.SSAValueDefinition) (op operand) {
if def.IsFromBlockParam() {
return newOperandReg(def.BlkParamVReg)
}
if def.SSAValue().Type() == ssa.TypeV128 {
// SIMD instructions require strict memory alignment, so we don't support the memory operand for V128 at the moment.
return m.getOperand_Reg(def)
}
if m.c.MatchInstr(def, ssa.OpcodeLoad) {
instr := def.Instr
ptr, offset, _ := instr.LoadData()
op = newOperandMem(m.lowerToAddressMode(ptr, offset))
instr.MarkLowered()
return op
}
return m.getOperand_Reg(def)
}
func (m *machine) getOperand_Mem_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
if def.IsFromBlockParam() {
return newOperandReg(def.BlkParamVReg)
}
if m.c.MatchInstr(def, ssa.OpcodeLoad) {
instr := def.Instr
ptr, offset, _ := instr.LoadData()
op = newOperandMem(m.lowerToAddressMode(ptr, offset))
instr.MarkLowered()
return op
}
return m.getOperand_Imm32_Reg(def)
}
func (m *machine) getOperand_Imm32_Reg(def *backend.SSAValueDefinition) (op operand) {
if def.IsFromBlockParam() {
return newOperandReg(def.BlkParamVReg)
}
instr := def.Instr
if instr.Constant() {
// If the operation is 64-bit, x64 sign-extends the 32-bit immediate value.
// Therefore, the immediate can only be used if the value fits in 32 bits and, for 64-bit operations,
// its sign bit is clear (otherwise sign extension would change the value).
if op, ok := asImm32Operand(instr.ConstantVal(), instr.Return().Type() == ssa.TypeI32); ok {
instr.MarkLowered()
return op
}
}
return m.getOperand_Reg(def)
}
func asImm32Operand(val uint64, allowSignExt bool) (operand, bool) {
if imm32, ok := asImm32(val, allowSignExt); ok {
return newOperandImm32(imm32), true
}
return operand{}, false
}
func asImm32(val uint64, allowSignExt bool) (uint32, bool) {
u32val := uint32(val)
if uint64(u32val) != val {
return 0, false
}
if !allowSignExt && u32val&0x80000000 != 0 {
return 0, false
}
return u32val, true
}
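To make the sign-extension rule above concrete: a 32-bit immediate used in a 64-bit operation is sign-extended, so 0x80000000 cannot be encoded there even though it fits in 32 bits. A small, self-contained illustration of the rule asImm32 enforces (my sketch, not part of this file):

package main

import "fmt"

// fitsImm32 mirrors asImm32: allowSignExt is true only for 32-bit operations.
func fitsImm32(val uint64, allowSignExt bool) bool {
	u32 := uint32(val)
	if uint64(u32) != val {
		return false // does not fit in 32 bits at all.
	}
	return allowSignExt || u32&0x8000_0000 == 0
}

func main() {
	fmt.Println(fitsImm32(0x7fffffff, false)) // true: fits, sign bit clear.
	fmt.Println(fitsImm32(0x80000000, false)) // false: would sign-extend to 0xffffffff80000000.
	fmt.Println(fitsImm32(0x80000000, true))  // true: fine for a 32-bit operation.
}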
func (m *machine) getOperand_Reg(def *backend.SSAValueDefinition) (op operand) {
var v regalloc.VReg
if def.IsFromBlockParam() {
v = def.BlkParamVReg
} else {
instr := def.Instr
if instr.Constant() {
// We inline all the constant instructions so that we can reduce register usage.
v = m.lowerConstant(instr)
instr.MarkLowered()
} else {
if n := def.N; n == 0 {
v = m.c.VRegOf(instr.Return())
} else {
_, rs := instr.Returns()
v = m.c.VRegOf(rs[n-1])
}
}
}
return newOperandReg(v)
}

View File

@ -0,0 +1,11 @@
//go:build !tinygo
package amd64
import "reflect"
// setSliceLimits sets both Cap and Len for the given reflected slice.
func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
s.Len = int(limit)
s.Cap = int(limit)
}

View File

@ -0,0 +1,11 @@
//go:build tinygo
package amd64
import "reflect"
// setSliceLimits sets both Cap and Len for the given reflected slice.
func setSliceLimits(s *reflect.SliceHeader, limit uintptr) {
s.Len = limit
s.Cap = limit
}

View File

@ -0,0 +1,181 @@
package amd64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
)
// Amd64-specific registers.
const (
// rax is a gp register.
rax = regalloc.RealRegInvalid + 1 + iota
// rcx is a gp register.
rcx
// rdx is a gp register.
rdx
// rbx is a gp register.
rbx
// rsp is a gp register.
rsp
// rbp is a gp register.
rbp
// rsi is a gp register.
rsi
// rdi is a gp register.
rdi
// r8 is a gp register.
r8
// r9 is a gp register.
r9
// r10 is a gp register.
r10
// r11 is a gp register.
r11
// r12 is a gp register.
r12
// r13 is a gp register.
r13
// r14 is a gp register.
r14
// r15 is a gp register.
r15
// xmm0 is a vector register.
xmm0
// xmm1 is a vector register.
xmm1
// xmm2 is a vector register.
xmm2
// xmm3 is a vector register.
xmm3
// xmm4 is a vector register.
xmm4
// xmm5 is a vector register.
xmm5
// xmm6 is a vector register.
xmm6
// xmm7 is a vector register.
xmm7
// xmm8 is a vector register.
xmm8
// xmm9 is a vector register.
xmm9
// xmm10 is a vector register.
xmm10
// xmm11 is a vector register.
xmm11
// xmm12 is a vector register.
xmm12
// xmm13 is a vector register.
xmm13
// xmm14 is a vector register.
xmm14
// xmm15 is a vector register.
xmm15
)
var (
raxVReg = regalloc.FromRealReg(rax, regalloc.RegTypeInt)
rcxVReg = regalloc.FromRealReg(rcx, regalloc.RegTypeInt)
rdxVReg = regalloc.FromRealReg(rdx, regalloc.RegTypeInt)
rbxVReg = regalloc.FromRealReg(rbx, regalloc.RegTypeInt)
rspVReg = regalloc.FromRealReg(rsp, regalloc.RegTypeInt)
rbpVReg = regalloc.FromRealReg(rbp, regalloc.RegTypeInt)
rsiVReg = regalloc.FromRealReg(rsi, regalloc.RegTypeInt)
rdiVReg = regalloc.FromRealReg(rdi, regalloc.RegTypeInt)
r8VReg = regalloc.FromRealReg(r8, regalloc.RegTypeInt)
r9VReg = regalloc.FromRealReg(r9, regalloc.RegTypeInt)
r10VReg = regalloc.FromRealReg(r10, regalloc.RegTypeInt)
r11VReg = regalloc.FromRealReg(r11, regalloc.RegTypeInt)
r12VReg = regalloc.FromRealReg(r12, regalloc.RegTypeInt)
r13VReg = regalloc.FromRealReg(r13, regalloc.RegTypeInt)
r14VReg = regalloc.FromRealReg(r14, regalloc.RegTypeInt)
r15VReg = regalloc.FromRealReg(r15, regalloc.RegTypeInt)
xmm0VReg = regalloc.FromRealReg(xmm0, regalloc.RegTypeFloat)
xmm1VReg = regalloc.FromRealReg(xmm1, regalloc.RegTypeFloat)
xmm2VReg = regalloc.FromRealReg(xmm2, regalloc.RegTypeFloat)
xmm3VReg = regalloc.FromRealReg(xmm3, regalloc.RegTypeFloat)
xmm4VReg = regalloc.FromRealReg(xmm4, regalloc.RegTypeFloat)
xmm5VReg = regalloc.FromRealReg(xmm5, regalloc.RegTypeFloat)
xmm6VReg = regalloc.FromRealReg(xmm6, regalloc.RegTypeFloat)
xmm7VReg = regalloc.FromRealReg(xmm7, regalloc.RegTypeFloat)
xmm8VReg = regalloc.FromRealReg(xmm8, regalloc.RegTypeFloat)
xmm9VReg = regalloc.FromRealReg(xmm9, regalloc.RegTypeFloat)
xmm10VReg = regalloc.FromRealReg(xmm10, regalloc.RegTypeFloat)
xmm11VReg = regalloc.FromRealReg(xmm11, regalloc.RegTypeFloat)
xmm12VReg = regalloc.FromRealReg(xmm12, regalloc.RegTypeFloat)
xmm13VReg = regalloc.FromRealReg(xmm13, regalloc.RegTypeFloat)
xmm14VReg = regalloc.FromRealReg(xmm14, regalloc.RegTypeFloat)
xmm15VReg = regalloc.FromRealReg(xmm15, regalloc.RegTypeFloat)
)
var regNames = [...]string{
rax: "rax",
rcx: "rcx",
rdx: "rdx",
rbx: "rbx",
rsp: "rsp",
rbp: "rbp",
rsi: "rsi",
rdi: "rdi",
r8: "r8",
r9: "r9",
r10: "r10",
r11: "r11",
r12: "r12",
r13: "r13",
r14: "r14",
r15: "r15",
xmm0: "xmm0",
xmm1: "xmm1",
xmm2: "xmm2",
xmm3: "xmm3",
xmm4: "xmm4",
xmm5: "xmm5",
xmm6: "xmm6",
xmm7: "xmm7",
xmm8: "xmm8",
xmm9: "xmm9",
xmm10: "xmm10",
xmm11: "xmm11",
xmm12: "xmm12",
xmm13: "xmm13",
xmm14: "xmm14",
xmm15: "xmm15",
}
func formatVRegSized(r regalloc.VReg, _64 bool) string {
if r.IsRealReg() {
if r.RegType() == regalloc.RegTypeInt {
rr := r.RealReg()
orig := regNames[rr]
if rr <= rdi {
if _64 {
return "%" + orig
} else {
return "%e" + orig[1:]
}
} else {
if _64 {
return "%" + orig
} else {
return "%" + orig + "d"
}
}
} else {
return "%" + regNames[r.RealReg()]
}
} else {
if r.RegType() == regalloc.RegTypeInt {
if _64 {
return fmt.Sprintf("%%r%d?", r.ID())
} else {
return fmt.Sprintf("%%r%dd?", r.ID())
}
} else {
return fmt.Sprintf("%%xmm%d?", r.ID())
}
}
}
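For reference, the naming rule above follows AT&T conventions: legacy integer registers swap their leading "r" for "e" in 32-bit form, r8-r15 gain a "d" suffix, and virtual registers get a trailing "?". A tiny standalone sketch of just that rule (illustrative, independent of this package):

package main

import "fmt"

// name32 shows the 32-bit renaming applied by formatVRegSized for real integer registers.
func name32(orig string, legacy bool) string {
	if legacy {
		return "%e" + orig[1:] // e.g. rax -> %eax
	}
	return "%" + orig + "d" // e.g. r8 -> %r8d
}

func main() {
	fmt.Println(name32("rax", true), name32("r8", false)) // %eax %r8d
}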

View File

@ -0,0 +1,128 @@
package amd64
import (
"encoding/binary"
"reflect"
"unsafe"
"github.com/tetratelabs/wazero/internal/wasmdebug"
)
func stackView(rbp, top uintptr) []byte {
var stackBuf []byte
{
// TODO: use unsafe.Slice after floor version is set to Go 1.20.
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
hdr.Data = rbp
setSliceLimits(hdr, top-rbp)
}
return stackBuf
}
// UnwindStack implements wazevo.unwindStack.
func UnwindStack(_, rbp, top uintptr, returnAddresses []uintptr) []uintptr {
stackBuf := stackView(rbp, top)
for i := uint64(0); i < uint64(len(stackBuf)); {
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- Caller_RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- RBP
// (low address)
callerRBP := binary.LittleEndian.Uint64(stackBuf[i:])
retAddr := binary.LittleEndian.Uint64(stackBuf[i+8:])
returnAddresses = append(returnAddresses, uintptr(retAddr))
i = callerRBP - uint64(rbp)
if len(returnAddresses) == wasmdebug.MaxFrames {
break
}
}
return returnAddresses
}
// GoCallStackView implements wazevo.goCallStackView.
func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
// (high address)
// +-----------------+ <----+
// | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
// ^ | arg[N]/ret[M] | |
// sliceSize | | ............ | | SizeInBytes/8
// | | arg[1]/ret[1] | |
// v | arg[0]/ret[0] | <----+
// | SizeInBytes |
// +-----------------+ <---- stackPointerBeforeGoCall
// (low address)
data := unsafe.Pointer(uintptr(unsafe.Pointer(stackPointerBeforeGoCall)) + 8)
size := *stackPointerBeforeGoCall / 8
return unsafe.Slice((*uint64)(data), int(size))
}
func AdjustClonedStack(oldRsp, oldTop, rsp, rbp, top uintptr) {
diff := uint64(rsp - oldRsp)
newBuf := stackView(rbp, top)
for i := uint64(0); i < uint64(len(newBuf)); {
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- Caller_RBP
// | ........... |
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 0 |
// | ReturnAddress |
// | Caller_RBP |
// +-----------------+ <---- RBP
// (low address)
callerRBP := binary.LittleEndian.Uint64(newBuf[i:])
if callerRBP == 0 {
// End of stack.
break
}
if i64 := int64(callerRBP); i64 < int64(oldRsp) || i64 >= int64(oldTop) {
panic("BUG: callerRBP is out of range")
}
if int(callerRBP) < 0 {
panic("BUG: callerRBP is negative")
}
adjustedCallerRBP := callerRBP + diff
if int(adjustedCallerRBP) < 0 {
panic("BUG: adjustedCallerRBP is negative")
}
binary.LittleEndian.PutUint64(newBuf[i:], adjustedCallerRBP)
i = adjustedCallerRBP - uint64(rbp)
}
}

View File

@ -0,0 +1,332 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// References:
// * https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#arm64-architecture
// * https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
var (
intParamResultRegs = []regalloc.RealReg{x0, x1, x2, x3, x4, x5, x6, x7}
floatParamResultRegs = []regalloc.RealReg{v0, v1, v2, v3, v4, v5, v6, v7}
)
var regInfo = &regalloc.RegisterInfo{
AllocatableRegisters: [regalloc.NumRegType][]regalloc.RealReg{
// We don't allocate:
// - x18: Reserved by the macOS: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Respect-the-purpose-of-specific-CPU-registers
// - x28: Reserved by Go runtime.
// - x27(=tmpReg): because of the reason described on tmpReg.
regalloc.RegTypeInt: {
x8, x9, x10, x11, x12, x13, x14, x15,
x16, x17, x19, x20, x21, x22, x23, x24, x25,
x26, x29, x30,
// These are the argument/return registers. Less preferred in the allocation.
x7, x6, x5, x4, x3, x2, x1, x0,
},
regalloc.RegTypeFloat: {
v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
// These are the argument/return registers. Less preferred in the allocation.
v7, v6, v5, v4, v3, v2, v1, v0,
},
},
CalleeSavedRegisters: regalloc.NewRegSet(
x19, x20, x21, x22, x23, x24, x25, x26, x28,
v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
),
CallerSavedRegisters: regalloc.NewRegSet(
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x29, x30,
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
),
RealRegToVReg: []regalloc.VReg{
x0: x0VReg, x1: x1VReg, x2: x2VReg, x3: x3VReg, x4: x4VReg, x5: x5VReg, x6: x6VReg, x7: x7VReg, x8: x8VReg, x9: x9VReg, x10: x10VReg, x11: x11VReg, x12: x12VReg, x13: x13VReg, x14: x14VReg, x15: x15VReg, x16: x16VReg, x17: x17VReg, x18: x18VReg, x19: x19VReg, x20: x20VReg, x21: x21VReg, x22: x22VReg, x23: x23VReg, x24: x24VReg, x25: x25VReg, x26: x26VReg, x27: x27VReg, x28: x28VReg, x29: x29VReg, x30: x30VReg,
v0: v0VReg, v1: v1VReg, v2: v2VReg, v3: v3VReg, v4: v4VReg, v5: v5VReg, v6: v6VReg, v7: v7VReg, v8: v8VReg, v9: v9VReg, v10: v10VReg, v11: v11VReg, v12: v12VReg, v13: v13VReg, v14: v14VReg, v15: v15VReg, v16: v16VReg, v17: v17VReg, v18: v18VReg, v19: v19VReg, v20: v20VReg, v21: v21VReg, v22: v22VReg, v23: v23VReg, v24: v24VReg, v25: v25VReg, v26: v26VReg, v27: v27VReg, v28: v28VReg, v29: v29VReg, v30: v30VReg, v31: v31VReg,
},
RealRegName: func(r regalloc.RealReg) string { return regNames[r] },
RealRegType: func(r regalloc.RealReg) regalloc.RegType {
if r < v0 {
return regalloc.RegTypeInt
}
return regalloc.RegTypeFloat
},
}
// ArgsResultsRegs implements backend.Machine.
func (m *machine) ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg) {
return intParamResultRegs, floatParamResultRegs
}
// LowerParams implements backend.FunctionABI.
func (m *machine) LowerParams(args []ssa.Value) {
a := m.currentABI
for i, ssaArg := range args {
if !ssaArg.Valid() {
continue
}
reg := m.compiler.VRegOf(ssaArg)
arg := &a.Args[i]
if arg.Kind == backend.ABIArgKindReg {
m.InsertMove(reg, arg.Reg, arg.Type)
} else {
// TODO: we could use pair load if there's consecutive loads for the same type.
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 | <-|
// | ReturnAddress | |
// +-----------------+ |
// | ........... | |
// | clobbered M | | argStackOffset: is unknown at this point of compilation.
// | ............ | |
// | clobbered 0 | |
// | spill slot N | |
// | ........... | |
// | spill slot 0 | |
// SP---> +-----------------+ <-+
// (low address)
bits := arg.Type.Bits()
// At this point of compilation, we don't yet know how much space exist below the return address.
// So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation.
amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace}
load := m.allocateInstr()
switch arg.Type {
case ssa.TypeI32, ssa.TypeI64:
load.asULoad(operandNR(reg), amode, bits)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
load.asFpuLoad(operandNR(reg), amode, bits)
default:
panic("BUG")
}
m.insert(load)
m.unresolvedAddressModes = append(m.unresolvedAddressModes, load)
}
}
}
// LowerReturns lowers the given returns.
func (m *machine) LowerReturns(rets []ssa.Value) {
a := m.currentABI
l := len(rets) - 1
for i := range rets {
// Reverse order in order to avoid overwriting the stack returns existing in the return registers.
ret := rets[l-i]
r := &a.Rets[l-i]
reg := m.compiler.VRegOf(ret)
if def := m.compiler.ValueDefinition(ret); def.IsFromInstr() {
// Constant instructions are inlined.
if inst := def.Instr; inst.Constant() {
val := inst.Return()
valType := val.Type()
v := inst.ConstantVal()
m.insertLoadConstant(v, valType, reg)
}
}
if r.Kind == backend.ABIArgKindReg {
m.InsertMove(r.Reg, reg, ret.Type())
} else {
// TODO: we could use pair store if there's consecutive stores for the same type.
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 | <-+
// | arg X | |
// | ....... | |
// | arg 1 | |
// | arg 0 | |
// | ReturnAddress | |
// +-----------------+ |
// | ........... | |
// | spill slot M | | retStackOffset: is unknown at this point of compilation.
// | ............ | |
// | spill slot 2 | |
// | spill slot 1 | |
// | clobbered 0 | |
// | clobbered 1 | |
// | ........... | |
// | clobbered N | |
// SP---> +-----------------+ <-+
// (low address)
bits := r.Type.Bits()
// At this point of compilation, we don't yet know how much space exist below the return address.
// So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation.
amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace}
store := m.allocateInstr()
store.asStore(operandNR(reg), amode, bits)
m.insert(store)
m.unresolvedAddressModes = append(m.unresolvedAddressModes, store)
}
}
}
// callerGenVRegToFunctionArg is the opposite of GenFunctionArgToVReg, which is used to generate the
// caller side of the function call.
func (m *machine) callerGenVRegToFunctionArg(a *backend.FunctionABI, argIndex int, reg regalloc.VReg, def *backend.SSAValueDefinition, slotBegin int64) {
arg := &a.Args[argIndex]
if def != nil && def.IsFromInstr() {
// Constant instructions are inlined.
if inst := def.Instr; inst.Constant() {
val := inst.Return()
valType := val.Type()
v := inst.ConstantVal()
m.insertLoadConstant(v, valType, reg)
}
}
if arg.Kind == backend.ABIArgKindReg {
m.InsertMove(arg.Reg, reg, arg.Type)
} else {
// TODO: we could use pair store if there's consecutive stores for the same type.
//
// Note that at this point, stack pointer is already adjusted.
bits := arg.Type.Bits()
amode := m.resolveAddressModeForOffset(arg.Offset-slotBegin, bits, spVReg, false)
store := m.allocateInstr()
store.asStore(operandNR(reg), amode, bits)
m.insert(store)
}
}
func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex int, reg regalloc.VReg, slotBegin int64) {
r := &a.Rets[retIndex]
if r.Kind == backend.ABIArgKindReg {
m.InsertMove(reg, r.Reg, r.Type)
} else {
// TODO: we could use pair load if there's consecutive loads for the same type.
amode := m.resolveAddressModeForOffset(a.ArgStackSize+r.Offset-slotBegin, r.Type.Bits(), spVReg, false)
ldr := m.allocateInstr()
switch r.Type {
case ssa.TypeI32, ssa.TypeI64:
ldr.asULoad(operandNR(reg), amode, r.Type.Bits())
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits())
default:
panic("BUG")
}
m.insert(ldr)
}
}
func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) {
exct := m.executableContext
exct.PendingInstructions = exct.PendingInstructions[:0]
mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse)
for _, instr := range exct.PendingInstructions {
cur = linkInstr(cur, instr)
}
return cur, mode
}
func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode {
if rn.RegType() != regalloc.RegTypeInt {
panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64))
}
var amode addressMode
if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) {
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset}
} else if offsetFitsInAddressModeKindRegSignedImm9(offset) {
amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset}
} else {
var indexReg regalloc.VReg
if allowTmpRegUse {
m.lowerConstantI64(tmpRegVReg, offset)
indexReg = tmpRegVReg
} else {
indexReg = m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(indexReg, offset)
}
amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */}
}
return amode
}
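The selection above follows the standard arm64 load/store encodings. As a rough sketch of the decision (the exact bounds live in offsetFitsInAddressModeKindRegUnsignedImm12 and offsetFitsInAddressModeKindRegSignedImm9, which are not shown here, so the limits below are an assumption based on the architecture rather than on this file):

package main

import "fmt"

// pickAmode sketches the choice made by resolveAddressModeForOffset, assuming the
// usual arm64 limits: a scaled unsigned 12-bit immediate, else a signed 9-bit
// immediate, else the offset is materialized into an index register.
func pickAmode(offset, sizeInBytes int64) string {
	switch {
	case offset >= 0 && offset%sizeInBytes == 0 && offset/sizeInBytes < 4096:
		return "RegUnsignedImm12" // e.g. ldr x0, [rn, #offset]
	case offset >= -256 && offset <= 255:
		return "RegSignedImm9" // e.g. ldur x0, [rn, #offset]
	default:
		return "RegReg" // lowerConstantI64 into a register, then [rn, rm]
	}
}

func main() {
	fmt.Println(pickAmode(32760, 8)) // RegUnsignedImm12
	fmt.Println(pickAmode(-8, 8))    // RegSignedImm9
	fmt.Println(pickAmode(1<<20, 8)) // RegReg
}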
func (m *machine) lowerCall(si *ssa.Instruction) {
isDirectCall := si.Opcode() == ssa.OpcodeCall
var indirectCalleePtr ssa.Value
var directCallee ssa.FuncRef
var sigID ssa.SignatureID
var args []ssa.Value
if isDirectCall {
directCallee, sigID, args = si.CallData()
} else {
indirectCalleePtr, sigID, args, _ /* on arm64, the calling convention is compatible with the Go runtime */ = si.CallIndirectData()
}
calleeABI := m.compiler.GetFunctionABI(m.compiler.SSABuilder().ResolveSignature(sigID))
stackSlotSize := int64(calleeABI.AlignedArgResultStackSlotSize())
if m.maxRequiredStackSizeForCalls < stackSlotSize+16 {
m.maxRequiredStackSizeForCalls = stackSlotSize + 16 // return address frame.
}
for i, arg := range args {
reg := m.compiler.VRegOf(arg)
def := m.compiler.ValueDefinition(arg)
m.callerGenVRegToFunctionArg(calleeABI, i, reg, def, stackSlotSize)
}
if isDirectCall {
call := m.allocateInstr()
call.asCall(directCallee, calleeABI)
m.insert(call)
} else {
ptr := m.compiler.VRegOf(indirectCalleePtr)
callInd := m.allocateInstr()
callInd.asCallIndirect(ptr, calleeABI)
m.insert(callInd)
}
var index int
r1, rs := si.Returns()
if r1.Valid() {
m.callerGenFunctionReturnVReg(calleeABI, 0, m.compiler.VRegOf(r1), stackSlotSize)
index++
}
for _, r := range rs {
m.callerGenFunctionReturnVReg(calleeABI, index, m.compiler.VRegOf(r), stackSlotSize)
index++
}
}
func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add bool) {
if imm12Operand, ok := asImm12Operand(uint64(diff)); ok {
alu := m.allocateInstr()
var ao aluOp
if add {
ao = aluOpAdd
} else {
ao = aluOpSub
}
alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true)
m.insert(alu)
} else {
m.lowerConstantI64(tmpRegVReg, diff)
alu := m.allocateInstr()
var ao aluOp
if add {
ao = aluOpAdd
} else {
ao = aluOpSub
}
alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true)
m.insert(alu)
}
}

View File

@ -0,0 +1,9 @@
package arm64
// entrypoint enters the machine code generated by this backend which begins with the preamble generated by functionABI.EmitGoEntryPreamble below.
// This implements wazevo.entrypoint, and see the comments there for detail.
func entrypoint(preambleExecutable, functionExecutable *byte, executionContextPtr uintptr, moduleContextPtr *byte, paramResultPtr *uint64, goAllocatedStackSlicePtr uintptr)
// afterGoFunctionCallEntrypoint enters the machine code after growing the stack.
// This implements wazevo.afterGoFunctionCallEntrypoint, and see the comments there for detail.
func afterGoFunctionCallEntrypoint(executable *byte, executionContextPtr uintptr, stackPointer, framePointer uintptr)

View File

@ -0,0 +1,29 @@
//go:build arm64
#include "funcdata.h"
#include "textflag.h"
// See the comments on EmitGoEntryPreamble for what this function is supposed to do.
TEXT ·entrypoint(SB), NOSPLIT|NOFRAME, $0-48
MOVD preambleExecutable+0(FP), R27
MOVD functionExecutable+8(FP), R24
MOVD executionContextPtr+16(FP), R0
MOVD moduleContextPtr+24(FP), R1
MOVD paramResultSlicePtr+32(FP), R19
MOVD goAllocatedStackSlicePtr+40(FP), R26
JMP (R27)
TEXT ·afterGoFunctionCallEntrypoint(SB), NOSPLIT|NOFRAME, $0-32
MOVD goCallReturnAddress+0(FP), R20
MOVD executionContextPtr+8(FP), R0
MOVD stackPointer+16(FP), R19
// Save the current FP(R29), SP and LR(R30) into the wazevo.executionContext (stored in R0).
MOVD R29, 16(R0) // Store FP(R29) into [R0, #ExecutionContextOffsets.OriginalFramePointer]
MOVD RSP, R27 // Move SP to R27 (temporary register) since SP cannot be stored directly in str instructions.
MOVD R27, 24(R0) // Store R27 into [R0, #ExecutionContextOffsets.OriginalStackPointer]
MOVD R30, 32(R0) // Store R30 into [R0, #ExecutionContextOffsets.GoReturnAddress]
// Load the new stack pointer (which sits somewhere in Go-allocated stack) into SP.
MOVD R19, RSP
JMP (R20)

View File

@ -0,0 +1,230 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
// CompileEntryPreamble implements backend.Machine. This assumes `entrypoint` function (in abi_go_entry_arm64.s) passes:
//
// 1. First (execution context ptr) and Second arguments are already passed in x0, and x1.
// 2. param/result slice ptr in x19; the pointer to []uint64{} which is used to pass arguments and accept return values.
// 3. Go-allocated stack slice ptr in x26.
// 4. Function executable in x24.
//
// also SP and FP are correct Go-runtime-based values, and LR is the return address to the Go-side caller.
func (m *machine) CompileEntryPreamble(signature *ssa.Signature) []byte {
root := m.constructEntryPreamble(signature)
m.encode(root)
return m.compiler.Buf()
}
var (
executionContextPtrReg = x0VReg
// callee-saved regs so that they can be used in the prologue and epilogue.
paramResultSlicePtr = x19VReg
savedExecutionContextPtr = x20VReg
// goAllocatedStackPtr is not used in the epilogue.
goAllocatedStackPtr = x26VReg
// paramResultSliceCopied is not used in the epilogue.
paramResultSliceCopied = x25VReg
// tmpRegVReg is not used in the epilogue.
functionExecutable = x24VReg
)
func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regalloc.VReg, arg *backend.ABIArg, argStartOffsetFromSP int64) *instruction {
typ := arg.Type
bits := typ.Bits()
isStackArg := arg.Kind == backend.ABIArgKindStack
var loadTargetReg operand
if !isStackArg {
loadTargetReg = operandNR(arg.Reg)
} else {
switch typ {
case ssa.TypeI32, ssa.TypeI64:
loadTargetReg = operandNR(x15VReg)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
loadTargetReg = operandNR(v15VReg)
default:
panic("TODO?")
}
}
var postIndexImm int64
if typ == ssa.TypeV128 {
postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
} else {
postIndexImm = 8
}
loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm}
instr := m.allocateInstr()
switch typ {
case ssa.TypeI32:
instr.asULoad(loadTargetReg, loadMode, 32)
case ssa.TypeI64:
instr.asULoad(loadTargetReg, loadMode, 64)
case ssa.TypeF32:
instr.asFpuLoad(loadTargetReg, loadMode, 32)
case ssa.TypeF64:
instr.asFpuLoad(loadTargetReg, loadMode, 64)
case ssa.TypeV128:
instr.asFpuLoad(loadTargetReg, loadMode, 128)
}
cur = linkInstr(cur, instr)
if isStackArg {
var storeMode addressMode
cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true)
toStack := m.allocateInstr()
toStack.asStore(loadTargetReg, storeMode, bits)
cur = linkInstr(cur, toStack)
}
return cur
}
func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr regalloc.VReg, result *backend.ABIArg, resultStartOffsetFromSP int64) *instruction {
isStackArg := result.Kind == backend.ABIArgKindStack
typ := result.Type
bits := typ.Bits()
var storeTargetReg operand
if !isStackArg {
storeTargetReg = operandNR(result.Reg)
} else {
switch typ {
case ssa.TypeI32, ssa.TypeI64:
storeTargetReg = operandNR(x15VReg)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
storeTargetReg = operandNR(v15VReg)
default:
panic("TODO?")
}
}
var postIndexImm int64
if typ == ssa.TypeV128 {
postIndexImm = 16 // v128 is represented as 2x64-bit in Go slice.
} else {
postIndexImm = 8
}
if isStackArg {
var loadMode addressMode
cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true)
toReg := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
toReg.asULoad(storeTargetReg, loadMode, bits)
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
toReg.asFpuLoad(storeTargetReg, loadMode, bits)
default:
panic("TODO?")
}
cur = linkInstr(cur, toReg)
}
mode := addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm}
instr := m.allocateInstr()
instr.asStore(storeTargetReg, mode, bits)
cur = linkInstr(cur, instr)
return cur
}
func (m *machine) constructEntryPreamble(sig *ssa.Signature) (root *instruction) {
abi := backend.FunctionABI{}
abi.Init(sig, intParamResultRegs, floatParamResultRegs)
root = m.allocateNop()
//// ----------------------------------- prologue ----------------------------------- ////
// First, we save executionContextPtrReg into a callee-saved register so that it can be used in epilogue as well.
// mov savedExecutionContextPtr, x0
cur := m.move64(savedExecutionContextPtr, executionContextPtrReg, root)
// Next, save the current FP, SP and LR into the wazevo.executionContext:
// str fp, [savedExecutionContextPtr, #OriginalFramePointer]
// mov tmp, sp ;; sp cannot be str'ed directly.
// str sp, [savedExecutionContextPtr, #OriginalStackPointer]
// str lr, [savedExecutionContextPtr, #GoReturnAddress]
cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, true, cur)
cur = m.move64(tmpRegVReg, spVReg, cur)
cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, true, cur)
cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, true, cur)
// Then, move the Go-allocated stack pointer to SP:
// mov sp, goAllocatedStackPtr
cur = m.move64(spVReg, goAllocatedStackPtr, cur)
prReg := paramResultSlicePtr
if len(abi.Args) > 2 && len(abi.Rets) > 0 {
// paramResultSlicePtr is modified during the execution of goEntryPreamblePassArg,
// so copy it to another reg.
cur = m.move64(paramResultSliceCopied, paramResultSlicePtr, cur)
prReg = paramResultSliceCopied
}
stackSlotSize := int64(abi.AlignedArgResultStackSlotSize())
for i := range abi.Args {
if i < 2 {
// module context ptr and execution context ptr are passed in x0 and x1 by the Go assembly function.
continue
}
arg := &abi.Args[i]
cur = m.goEntryPreamblePassArg(cur, prReg, arg, -stackSlotSize)
}
// Call the real function.
bl := m.allocateInstr()
bl.asCallIndirect(functionExecutable, &abi)
cur = linkInstr(cur, bl)
///// ----------------------------------- epilogue ----------------------------------- /////
// Store the register results into paramResultSlicePtr.
for i := range abi.Rets {
cur = m.goEntryPreamblePassResult(cur, paramResultSlicePtr, &abi.Rets[i], abi.ArgStackSize-stackSlotSize)
}
// Finally, restore the FP, SP and LR, and return to the Go code.
// ldr fp, [savedExecutionContextPtr, #OriginalFramePointer]
// ldr tmp, [savedExecutionContextPtr, #OriginalStackPointer]
// mov sp, tmp ;; sp cannot be str'ed directly.
// ldr lr, [savedExecutionContextPtr, #GoReturnAddress]
// ret ;; --> return to the Go code
cur = m.loadOrStoreAtExecutionContext(fpVReg, wazevoapi.ExecutionContextOffsetOriginalFramePointer, false, cur)
cur = m.loadOrStoreAtExecutionContext(tmpRegVReg, wazevoapi.ExecutionContextOffsetOriginalStackPointer, false, cur)
cur = m.move64(spVReg, tmpRegVReg, cur)
cur = m.loadOrStoreAtExecutionContext(lrVReg, wazevoapi.ExecutionContextOffsetGoReturnAddress, false, cur)
retInst := m.allocateInstr()
retInst.asRet()
linkInstr(cur, retInst)
return
}
func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction {
instr := m.allocateInstr()
instr.asMove64(dst, src)
return linkInstr(prev, instr)
}
func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction {
instr := m.allocateInstr()
mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()}
if store {
instr.asStore(operandNR(d), mode, 64)
} else {
instr.asULoad(operandNR(d), mode, 64)
}
return linkInstr(prev, instr)
}
func linkInstr(prev, next *instruction) *instruction {
prev.next = next
next.prev = prev
return next
}

View File

@ -0,0 +1,428 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
var calleeSavedRegistersSorted = []regalloc.VReg{
x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg,
v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}
// CompileGoFunctionTrampoline implements backend.Machine.
func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte {
exct := m.executableContext
argBegin := 1 // Skips exec context by default.
if needModuleContextPtr {
argBegin++
}
abi := &backend.FunctionABI{}
abi.Init(sig, intParamResultRegs, floatParamResultRegs)
m.currentABI = abi
cur := m.allocateInstr()
cur.asNop0()
exct.RootInstr = cur
// Execution context is always the first argument.
execCtrPtr := x0VReg
// In the following, we create the following stack layout:
//
// (high address)
// SP ------> +-----------------+ <----+
// | ....... | |
// | ret Y | |
// | ....... | |
// | ret 0 | |
// | arg X | | size_of_arg_ret
// | ....... | |
// | arg 1 | |
// | arg 0 | <----+ <-------- originalArg0Reg
// | size_of_arg_ret |
// | ReturnAddress |
// +-----------------+ <----+
// | xxxx | | ;; might be padded to make it 16-byte aligned.
// +--->| arg[N]/ret[M] | |
// sliceSize| | ............ | | goCallStackSize
// | | arg[1]/ret[1] | |
// +--->| arg[0]/ret[0] | <----+ <-------- arg0ret0AddrReg
// | sliceSize |
// | frame_size |
// +-----------------+
// (low address)
//
// where the region of "arg[0]/ret[0] ... arg[N]/ret[M]" is the stack used by the Go functions,
// therefore will be accessed as the usual []uint64. So that's where we need to pass/receive
// the arguments/return values.
// First of all, to update the SP, and create "ReturnAddress + size_of_arg_ret".
cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
const frameInfoSize = 16 // == frame_size + sliceSize.
// Next, we should allocate the stack for the Go function call if necessary.
goCallStackSize, sliceSizeInBytes := backend.GoFunctionCallRequiredStackSize(sig, argBegin)
cur = m.insertStackBoundsCheck(goCallStackSize+frameInfoSize, cur)
originalArg0Reg := x17VReg // Caller save, so we can use it for whatever we want.
if m.currentABI.AlignedArgResultStackSlotSize() > 0 {
// At this point, SP points to `ReturnAddress`, so add 16 to get the original arg 0 slot.
cur = m.addsAddOrSubStackPointer(cur, originalArg0Reg, frameInfoSize, true)
}
// Save the callee saved registers.
cur = m.saveRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
if needModuleContextPtr {
offset := wazevoapi.ExecutionContextOffsetGoFunctionCallCalleeModuleContextOpaque.I64()
if !offsetFitsInAddressModeKindRegUnsignedImm12(64, offset) {
panic("BUG: too large or un-aligned offset for goFunctionCallCalleeModuleContextOpaque in execution context")
}
// Module context is always the second argument.
moduleCtrPtr := x1VReg
store := m.allocateInstr()
amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset}
store.asStore(operandNR(moduleCtrPtr), amode, 64)
cur = linkInstr(cur, store)
}
// Advances the stack pointer.
cur = m.addsAddOrSubStackPointer(cur, spVReg, goCallStackSize, false)
// Copy the pointer to x15VReg.
arg0ret0AddrReg := x15VReg // Caller save, so we can use it for whatever we want.
copySp := m.allocateInstr()
copySp.asMove64(arg0ret0AddrReg, spVReg)
cur = linkInstr(cur, copySp)
// Next, we need to store all the arguments to the stack in the typical Wasm stack style.
for i := range abi.Args[argBegin:] {
arg := &abi.Args[argBegin+i]
store := m.allocateInstr()
var v regalloc.VReg
if arg.Kind == backend.ABIArgKindReg {
v = arg.Reg
} else {
cur, v = m.goFunctionCallLoadStackArg(cur, originalArg0Reg, arg,
// Caller save, so we can use it for whatever we want.
x11VReg, v11VReg)
}
var sizeInBits byte
if arg.Type == ssa.TypeV128 {
sizeInBits = 128
} else {
sizeInBits = 64
}
store.asStore(operandNR(v),
addressMode{
kind: addressModeKindPostIndex,
rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8),
}, sizeInBits)
cur = linkInstr(cur, store)
}
// Finally, now that we've advanced SP to arg[0]/ret[0], we allocate `frame_size + sliceSize`.
var frameSizeReg, sliceSizeReg regalloc.VReg
if goCallStackSize > 0 {
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, goCallStackSize)
frameSizeReg = tmpRegVReg
cur = m.lowerConstantI64AndInsert(cur, x16VReg, sliceSizeInBytes/8)
sliceSizeReg = x16VReg
} else {
frameSizeReg = xzrVReg
sliceSizeReg = xzrVReg
}
_amode := addressModePreOrPostIndex(spVReg, -16, true)
storeP := m.allocateInstr()
storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode)
cur = linkInstr(cur, storeP)
// Set the exit status on the execution context.
cur = m.setExitCode(cur, x0VReg, exitCode)
// Save the current stack pointer.
cur = m.saveCurrentStackPointer(cur, x0VReg)
// Exit the execution.
cur = m.storeReturnAddressAndExit(cur)
// After the call, we need to restore the callee saved registers.
cur = m.restoreRegistersInExecutionContext(cur, calleeSavedRegistersSorted)
// Get the pointer to the arg[0]/ret[0]: We need to skip `frame_size + sliceSize`.
if len(abi.Rets) > 0 {
cur = m.addsAddOrSubStackPointer(cur, arg0ret0AddrReg, frameInfoSize, true)
}
// Advances the SP so that it points to `ReturnAddress`.
cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true)
ldr := m.allocateInstr()
// And load the return address.
ldr.asULoad(operandNR(lrVReg),
addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
cur = linkInstr(cur, ldr)
originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want.
if m.currentABI.RetStackSize > 0 {
cur = m.addsAddOrSubStackPointer(cur, originalRet0Reg, m.currentABI.ArgStackSize, true)
}
// Make the SP point to the original address (above the result slot).
if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
}
for i := range abi.Rets {
r := &abi.Rets[i]
if r.Kind == backend.ABIArgKindReg {
loadIntoReg := m.allocateInstr()
mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
switch r.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asULoad(operandNR(r.Reg), mode, 32)
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asULoad(operandNR(r.Reg), mode, 64)
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32)
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64)
case ssa.TypeV128:
mode.imm = 16
loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128)
default:
panic("TODO")
}
cur = linkInstr(cur, loadIntoReg)
} else {
// First we need to load the value to a temporary just like ^^.
intTmp, floatTmp := x11VReg, v11VReg
loadIntoTmpReg := m.allocateInstr()
mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg}
var resultReg regalloc.VReg
switch r.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32)
resultReg = intTmp
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64)
resultReg = intTmp
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32)
resultReg = floatTmp
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64)
resultReg = floatTmp
case ssa.TypeV128:
mode.imm = 16
loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128)
resultReg = floatTmp
default:
panic("TODO")
}
cur = linkInstr(cur, loadIntoTmpReg)
cur = m.goFunctionCallStoreStackResult(cur, originalRet0Reg, r, resultReg)
}
}
ret := m.allocateInstr()
ret.asRet()
linkInstr(cur, ret)
m.encode(m.executableContext.RootInstr)
return m.compiler.Buf()
}
func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
store := m.allocateInstr()
var sizeInBits byte
switch v.RegType() {
case regalloc.RegTypeInt:
sizeInBits = 64
case regalloc.RegTypeFloat:
sizeInBits = 128
}
store.asStore(operandNR(v),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: offset,
}, sizeInBits)
store.prev = cur
cur.next = store
cur = store
offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally store regs at the offset of multiple of 16.
}
return cur
}
func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []regalloc.VReg) *instruction {
offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64()
for _, v := range regs {
load := m.allocateInstr()
var as func(dst operand, amode addressMode, sizeInBits byte)
var sizeInBits byte
switch v.RegType() {
case regalloc.RegTypeInt:
as = load.asULoad
sizeInBits = 64
case regalloc.RegTypeFloat:
as = load.asFpuLoad
sizeInBits = 128
}
as(operandNR(v),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: offset,
}, sizeInBits)
cur = linkInstr(cur, load)
offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16.
}
return cur
}
func (m *machine) lowerConstantI64AndInsert(cur *instruction, dst regalloc.VReg, v int64) *instruction {
exct := m.executableContext
exct.PendingInstructions = exct.PendingInstructions[:0]
m.lowerConstantI64(dst, v)
for _, instr := range exct.PendingInstructions {
cur = linkInstr(cur, instr)
}
return cur
}
func (m *machine) lowerConstantI32AndInsert(cur *instruction, dst regalloc.VReg, v int32) *instruction {
exct := m.executableContext
exct.PendingInstructions = exct.PendingInstructions[:0]
m.lowerConstantI32(dst, v)
for _, instr := range exct.PendingInstructions {
cur = linkInstr(cur, instr)
}
return cur
}
func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode wazevoapi.ExitCode) *instruction {
constReg := x17VReg // caller-saved, so we can use it.
cur = m.lowerConstantI32AndInsert(cur, constReg, int32(exitCode))
// Set the exit status on the execution context.
setExitStatus := m.allocateInstr()
setExitStatus.asStore(operandNR(constReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(),
}, 32)
cur = linkInstr(cur, setExitStatus)
return cur
}
func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction {
// Read the return address into tmp, and store it in the execution context.
adr := m.allocateInstr()
adr.asAdr(tmpRegVReg, exitSequenceSize+8)
cur = linkInstr(cur, adr)
storeReturnAddr := m.allocateInstr()
storeReturnAddr.asStore(operandNR(tmpRegVReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(),
}, 64)
cur = linkInstr(cur, storeReturnAddr)
// Exit the execution.
trapSeq := m.allocateInstr()
trapSeq.asExitSequence(x0VReg)
cur = linkInstr(cur, trapSeq)
return cur
}
func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VReg) *instruction {
// Save the current stack pointer:
// mov tmp, sp,
// str tmp, [exec_ctx, #stackPointerBeforeGoCall]
movSp := m.allocateInstr()
movSp.asMove64(tmpRegVReg, spVReg)
cur = linkInstr(cur, movSp)
strSp := m.allocateInstr()
strSp.asStore(operandNR(tmpRegVReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(),
}, 64)
cur = linkInstr(cur, strSp)
return cur
}
func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) {
load := m.allocateInstr()
var result regalloc.VReg
mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg}
switch arg.Type {
case ssa.TypeI32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asULoad(operandNR(intVReg), mode, 32)
result = intVReg
case ssa.TypeI64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asULoad(operandNR(intVReg), mode, 64)
result = intVReg
case ssa.TypeF32:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asFpuLoad(operandNR(floatVReg), mode, 32)
result = floatVReg
case ssa.TypeF64:
mode.imm = 8 // We use uint64 for all basic types, except SIMD v128.
load.asFpuLoad(operandNR(floatVReg), mode, 64)
result = floatVReg
case ssa.TypeV128:
mode.imm = 16
load.asFpuLoad(operandNR(floatVReg), mode, 128)
result = floatVReg
default:
panic("TODO")
}
cur = linkInstr(cur, load)
return cur, result
}
func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction {
store := m.allocateInstr()
mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg}
var sizeInBits byte
switch result.Type {
case ssa.TypeI32, ssa.TypeF32:
mode.imm = 8
sizeInBits = 32
case ssa.TypeI64, ssa.TypeF64:
mode.imm = 8
sizeInBits = 64
case ssa.TypeV128:
mode.imm = 16
sizeInBits = 128
default:
panic("TODO")
}
store.asStore(operandNR(resultVReg), mode, sizeInBits)
return linkInstr(cur, store)
}
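// Illustrative sketch (not part of this change): the type switches above advance the
// post-index pointer by a fixed slot size per value, 8 bytes for scalars and 16 for v128.
// The helper below is hypothetical and only restates that slot-size choice.
func exampleGoCallStackSlotSize(t ssa.Type) int64 {
switch t {
case ssa.TypeI32, ssa.TypeI64, ssa.TypeF32, ssa.TypeF64:
return 8 // Scalars always occupy a uint64-sized slot.
case ssa.TypeV128:
return 16 // SIMD v128 needs a full 16-byte slot.
default:
panic("TODO")
}
}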

View File

@ -0,0 +1,215 @@
package arm64
import (
"strconv"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type (
cond uint64
condKind byte
)
const (
// condKindRegisterZero represents a condition which checks if the register is zero.
// This indicates that the instruction must be encoded as CBZ:
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBZ--Compare-and-Branch-on-Zero-
condKindRegisterZero condKind = iota
// condKindRegisterNotZero indicates that the instruction must be encoded as CBNZ:
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/CBNZ--Compare-and-Branch-on-Nonzero-
condKindRegisterNotZero
// condKindCondFlagSet indicates that the instruction must be encoded as B.cond:
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-
condKindCondFlagSet
)
// kind returns the kind of condition which is stored in the first two bits.
func (c cond) kind() condKind {
return condKind(c & 0b11)
}
func (c cond) asUint64() uint64 {
return uint64(c)
}
// register returns the register for register conditions.
// This panics if the condition is not a register condition (condKindRegisterZero or condKindRegisterNotZero).
func (c cond) register() regalloc.VReg {
if c.kind() != condKindRegisterZero && c.kind() != condKindRegisterNotZero {
panic("condition is not a register")
}
return regalloc.VReg(c >> 2)
}
func registerAsRegZeroCond(r regalloc.VReg) cond {
return cond(r)<<2 | cond(condKindRegisterZero)
}
func registerAsRegNotZeroCond(r regalloc.VReg) cond {
return cond(r)<<2 | cond(condKindRegisterNotZero)
}
func (c cond) flag() condFlag {
if c.kind() != condKindCondFlagSet {
panic("condition is not a flag")
}
return condFlag(c >> 2)
}
func (c condFlag) asCond() cond {
return cond(c)<<2 | cond(condKindCondFlagSet)
}
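// Illustrative sketch (not part of this change): a cond packs its kind into the low two bits
// and the payload (register or flag) into the remaining bits, so the constructors and
// accessors above round-trip. The hypothetical function below only demonstrates that invariant.
func exampleCondRoundTrip(r regalloc.VReg) {
if c := registerAsRegZeroCond(r); c.kind() != condKindRegisterZero || c.register() != r {
panic("BUG in example: register cond must round-trip")
}
if c := eq.asCond(); c.kind() != condKindCondFlagSet || c.flag() != eq {
panic("BUG in example: flag cond must round-trip")
}
}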
// condFlag represents a condition flag for conditional branches.
// The value matches the encoding of condition flags in the ARM64 instruction set.
// https://developer.arm.com/documentation/den0024/a/The-A64-instruction-set/Data-processing-instructions/Conditional-instructions
type condFlag uint8
const (
eq condFlag = iota // eq represents "equal"
ne // ne represents "not equal"
hs // hs represents "higher or same"
lo // lo represents "lower"
mi // mi represents "minus or negative result"
pl // pl represents "plus or positive result"
vs // vs represents "overflow set"
vc // vc represents "overflow clear"
hi // hi represents "higher"
ls // ls represents "lower or same"
ge // ge represents "greater or equal"
lt // lt represents "less than"
gt // gt represents "greater than"
le // le represents "less than or equal"
al // al represents "always"
nv // nv represents "never"
)
// invert returns the inverted condition.
func (c condFlag) invert() condFlag {
switch c {
case eq:
return ne
case ne:
return eq
case hs:
return lo
case lo:
return hs
case mi:
return pl
case pl:
return mi
case vs:
return vc
case vc:
return vs
case hi:
return ls
case ls:
return hi
case ge:
return lt
case lt:
return ge
case gt:
return le
case le:
return gt
case al:
return nv
case nv:
return al
default:
panic(c)
}
}
// String implements fmt.Stringer.
func (c condFlag) String() string {
switch c {
case eq:
return "eq"
case ne:
return "ne"
case hs:
return "hs"
case lo:
return "lo"
case mi:
return "mi"
case pl:
return "pl"
case vs:
return "vs"
case vc:
return "vc"
case hi:
return "hi"
case ls:
return "ls"
case ge:
return "ge"
case lt:
return "lt"
case gt:
return "gt"
case le:
return "le"
case al:
return "al"
case nv:
return "nv"
default:
panic(strconv.Itoa(int(c)))
}
}
// condFlagFromSSAIntegerCmpCond returns the condition flag for the given ssa.IntegerCmpCond.
func condFlagFromSSAIntegerCmpCond(c ssa.IntegerCmpCond) condFlag {
switch c {
case ssa.IntegerCmpCondEqual:
return eq
case ssa.IntegerCmpCondNotEqual:
return ne
case ssa.IntegerCmpCondSignedLessThan:
return lt
case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
return ge
case ssa.IntegerCmpCondSignedGreaterThan:
return gt
case ssa.IntegerCmpCondSignedLessThanOrEqual:
return le
case ssa.IntegerCmpCondUnsignedLessThan:
return lo
case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual:
return hs
case ssa.IntegerCmpCondUnsignedGreaterThan:
return hi
case ssa.IntegerCmpCondUnsignedLessThanOrEqual:
return ls
default:
panic(c)
}
}
// condFlagFromSSAFloatCmpCond returns the condition flag for the given ssa.FloatCmpCond.
func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag {
switch c {
case ssa.FloatCmpCondEqual:
return eq
case ssa.FloatCmpCondNotEqual:
return ne
case ssa.FloatCmpCondLessThan:
return mi
case ssa.FloatCmpCondLessThanOrEqual:
return ls
case ssa.FloatCmpCondGreaterThan:
return gt
case ssa.FloatCmpCondGreaterThanOrEqual:
return ge
default:
panic(c)
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,301 @@
package arm64
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// lowerConstant allocates a new VReg and inserts the instruction to load the constant value.
func (m *machine) lowerConstant(instr *ssa.Instruction) (vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
vr = m.compiler.AllocateVReg(valType)
v := instr.ConstantVal()
m.insertLoadConstant(v, valType, vr)
return
}
// InsertLoadConstantBlockArg implements backend.Machine.
func (m *machine) InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg) {
val := instr.Return()
valType := val.Type()
v := instr.ConstantVal()
load := m.allocateInstr()
load.asLoadConstBlockArg(v, valType, vr)
m.insert(load)
}
func (m *machine) lowerLoadConstantBlockArgAfterRegAlloc(i *instruction) {
v, typ, dst := i.loadConstBlockArgData()
m.insertLoadConstant(v, typ, dst)
}
func (m *machine) insertLoadConstant(v uint64, valType ssa.Type, vr regalloc.VReg) {
if valType.Bits() < 64 { // Clear the redundant bits just in case it's unexpectedly sign-extended, etc.
v = v & ((1 << valType.Bits()) - 1)
}
switch valType {
case ssa.TypeF32:
loadF := m.allocateInstr()
loadF.asLoadFpuConst32(vr, v)
m.insert(loadF)
case ssa.TypeF64:
loadF := m.allocateInstr()
loadF.asLoadFpuConst64(vr, v)
m.insert(loadF)
case ssa.TypeI32:
if v == 0 {
m.InsertMove(vr, xzrVReg, ssa.TypeI32)
} else {
m.lowerConstantI32(vr, int32(v))
}
case ssa.TypeI64:
if v == 0 {
m.InsertMove(vr, xzrVReg, ssa.TypeI64)
} else {
m.lowerConstantI64(vr, int64(v))
}
default:
panic("TODO")
}
}
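// Illustrative sketch (not part of this change): insertLoadConstant masks away the bits above
// valType.Bits() before lowering, e.g. a sign-extended 32-bit constant loses its upper half.
// The helper name below is made up purely to show that masking step.
func exampleClearRedundantBits(v uint64, bits byte) uint64 {
if bits < 64 {
v &= (1 << bits) - 1 // e.g. v=0xffff_ffff_8000_0000, bits=32 -> 0x8000_0000.
}
return v
}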
// The following logic is based on the old asm/arm64 package.
// https://github.com/tetratelabs/wazero/blob/39f2ff23a6d609e10c82b9cc0b981f6de5b87a9c/internal/asm/arm64/impl.go
func (m *machine) lowerConstantI32(dst regalloc.VReg, c int32) {
// Following the logic here:
// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637
ic := int64(uint32(c))
if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) {
if isBitMaskImmediate(uint64(c), false) {
m.lowerConstViaBitMaskImmediate(uint64(uint32(c)), dst, false)
return
}
}
if t := const16bitAligned(int64(uint32(c))); t >= 0 {
// If the const fits in a single 16-bit aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
// we can load it with a single MOVZ.
m.insertMOVZ(dst, uint64(uint32(c)>>(16*t)), t, false)
} else if t := const16bitAligned(int64(^c)); t >= 0 {
// Also, if the inverse of the const fits in a single 16-bit aligned chunk, we can load it with a single MOVN.
m.insertMOVN(dst, uint64(^c>>(16*t)), t, false)
} else if isBitMaskImmediate(uint64(uint32(c)), false) {
m.lowerConstViaBitMaskImmediate(uint64(c), dst, false)
} else {
// Otherwise, we use MOVZ and MOVK to load it.
c16 := uint16(c)
m.insertMOVZ(dst, uint64(c16), 0, false)
c16 = uint16(uint32(c) >> 16)
m.insertMOVK(dst, uint64(c16), 1, false)
}
}
func (m *machine) lowerConstantI64(dst regalloc.VReg, c int64) {
// Following the logic here:
// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852
if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
if isBitMaskImmediate(uint64(c), true) {
m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
return
}
}
if t := const16bitAligned(c); t >= 0 {
// If the const fits in a single 16-bit aligned chunk, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000,
// we can load it with a single MOVZ.
m.insertMOVZ(dst, uint64(c)>>(16*t), t, true)
} else if t := const16bitAligned(^c); t >= 0 {
// Also, if the inverse of the const fits in a single 16-bit aligned chunk, we can load it with a single MOVN.
m.insertMOVN(dst, uint64(^c)>>(16*t), t, true)
} else if isBitMaskImmediate(uint64(c), true) {
m.lowerConstViaBitMaskImmediate(uint64(c), dst, true)
} else {
m.load64bitConst(c, dst)
}
}
func (m *machine) lowerConstViaBitMaskImmediate(c uint64, dst regalloc.VReg, b64 bool) {
instr := m.allocateInstr()
instr.asALUBitmaskImm(aluOpOrr, dst, xzrVReg, c, b64)
m.insert(instr)
}
// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate".
//
// Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits.
// Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits.
//
// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate-
func isBitMaskImmediate(x uint64, _64 bool) bool {
// All zeros and ones are not "bitmask immediate" by definition.
if x == 0 || (_64 && x == 0xffff_ffff_ffff_ffff) || (!_64 && x == 0xffff_ffff) {
return false
}
switch {
case x != x>>32|x<<32:
// e = 64
case x != x>>16|x<<48:
// e = 32 (x == x>>32|x<<32).
// e.g. 0x00ff_ff00_00ff_ff00
x = uint64(int32(x))
case x != x>>8|x<<56:
// e = 16 (x == x>>16|x<<48).
// e.g. 0x00ff_00ff_00ff_00ff
x = uint64(int16(x))
case x != x>>4|x<<60:
// e = 8 (x == x>>8|x<<56).
// e.g. 0x0f0f_0f0f_0f0f_0f0f
x = uint64(int8(x))
default:
// e = 4 or 2.
return true
}
return sequenceOfSetbits(x) || sequenceOfSetbits(^x)
}
// sequenceOfSetbits returns true if the number's binary representation is a single contiguous sequence of set bits (1s).
// For example: 0b1110 -> true, 0b1010 -> false
func sequenceOfSetbits(x uint64) bool {
y := getLowestBit(x)
// If x is a sequence of set bits, this should result in a number
// with only one set bit (i.e. a power of two).
y += x
return (y-1)&y == 0
}
func getLowestBit(x uint64) uint64 {
return x & (^x + 1)
}
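// Illustrative sketch (not part of this change): concrete values for the helpers above,
// matching the examples given in their comments.
func exampleBitMaskImmediateValues() {
_ = sequenceOfSetbits(0b1110) // true: one contiguous run of ones.
_ = sequenceOfSetbits(0b1010) // false: the set bits are not contiguous.
_ = isBitMaskImmediate(0, true) // false: all zeros is excluded by definition.
_ = isBitMaskImmediate(0xffff_ffff_ffff_ffff, true) // false: all ones is excluded by definition.
_ = isBitMaskImmediate(0xff, true) // true: a single run of eight ones.
}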
// const16bitAligned checks whether the value fits entirely within a single 16-bit aligned chunk.
// If so, it returns the chunk index (the shift amount divided by 16); otherwise, it returns -1.
func const16bitAligned(v int64) (ret int) {
ret = -1
for s := 0; s < 64; s += 16 {
if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 {
ret = s / 16
break
}
}
return
}
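// Illustrative sketch (not part of this change): const16bitAligned returns the index of the
// 16-bit chunk that fully contains the value, or -1 when the value spans more than one chunk.
func exampleConst16bitAligned() {
_ = const16bitAligned(0xffff) // 0: fits in bits [0, 16).
_ = const16bitAligned(0xffff_0000) // 1: fits in bits [16, 32).
_ = const16bitAligned(0x1234_0000_0000_0000) // 3: fits in bits [48, 64).
_ = const16bitAligned(0x1_0001) // -1: spans two 16-bit chunks.
}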
// load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit
// consts as in the Go assembler.
//
// See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759
func (m *machine) load64bitConst(c int64, dst regalloc.VReg) {
var bits [4]uint64
var zeros, negs int
for i := 0; i < 4; i++ {
bits[i] = uint64(c) >> uint(i*16) & 0xffff
if v := bits[i]; v == 0 {
zeros++
} else if v == 0xffff {
negs++
}
}
if zeros == 3 {
// one MOVZ instruction.
for i, v := range bits {
if v != 0 {
m.insertMOVZ(dst, v, i, true)
}
}
} else if negs == 3 {
// one MOVN instruction.
for i, v := range bits {
if v != 0xffff {
v = ^v
m.insertMOVN(dst, v, i, true)
}
}
} else if zeros == 2 {
// one MOVZ then one MOVK.
var movz bool
for i, v := range bits {
if !movz && v != 0 { // MOVZ.
m.insertMOVZ(dst, v, i, true)
movz = true
} else if v != 0 {
m.insertMOVK(dst, v, i, true)
}
}
} else if negs == 2 {
// one MOVN then one MOVK.
var movn bool
for i, v := range bits { // Emit MOVN.
if !movn && v != 0xffff {
v = ^v
// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
m.insertMOVN(dst, v, i, true)
movn = true
} else if v != 0xffff {
m.insertMOVK(dst, v, i, true)
}
}
} else if zeros == 1 {
// one MOVZ then two MOVK.
var movz bool
for i, v := range bits {
if !movz && v != 0 { // MOVZ.
m.insertMOVZ(dst, v, i, true)
movz = true
} else if v != 0 {
m.insertMOVK(dst, v, i, true)
}
}
} else if negs == 1 {
// one MOVN then two MOVK.
var movn bool
for i, v := range bits { // Emit MOVN.
if !movn && v != 0xffff {
v = ^v
// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
m.insertMOVN(dst, v, i, true)
movn = true
} else if v != 0xffff {
m.insertMOVK(dst, v, i, true)
}
}
} else {
// one MOVZ then up to three MOVK.
var movz bool
for i, v := range bits {
if !movz && v != 0 { // MOVZ.
m.insertMOVZ(dst, v, i, true)
movz = true
} else if v != 0 {
m.insertMOVK(dst, v, i, true)
}
}
}
}
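// Illustrative sketch (not part of this change): load64bitConst chooses between MOVZ/MOVN/MOVK
// by counting how many 16-bit halfwords are all-zero or all-one. For example,
// 0x0000_0000_0012_0000 has halfwords [0x0000, 0x0012, 0x0000, 0x0000], so zeros == 3 and a
// single MOVZ with shift index 1 suffices. The hypothetical helper below mirrors that counting step.
func exampleCountHalfwords(c int64) (zeros, negs int) {
for i := 0; i < 4; i++ {
switch uint64(c) >> uint(i*16) & 0xffff {
case 0:
zeros++
case 0xffff:
negs++
}
}
return
}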
func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
instr.asMOVZ(dst, v, uint64(shift), dst64)
m.insert(instr)
}
func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
instr.asMOVK(dst, v, uint64(shift), dst64)
m.insert(instr)
}
func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) {
instr := m.allocateInstr()
instr.asMOVN(dst, v, uint64(shift), dst64)
m.insert(instr)
}

File diff suppressed because it is too large

View File

@ -0,0 +1,350 @@
package arm64
// This file contains the logic to "find and determine operands" for instructions.
// In order to finalize the form of an operand, we might end up merging/eliminating
// the source instructions into an operand whenever possible.
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
type (
// operand represents an operand of an instruction whose type is determined by the kind.
operand struct {
kind operandKind
data, data2 uint64
}
operandKind byte
)
// Here's the list of operand kinds. We use the abbreviation of the kind name not only for these consts,
// but also for the names of the functions which return an operand of that kind.
const (
// operandKindNR represents "NormalRegister" (NR). This is literally the register without any special operation unlike others.
operandKindNR operandKind = iota
// operandKindSR represents "Shifted Register" (SR). This is a register which is shifted by a constant.
// Some of the arm64 instructions can take this kind of operand.
operandKindSR
// operandKindER represents "Extended Register" (ER). This is a register which is sign/zero-extended to a larger size.
// Some of the arm64 instructions can take this kind of operand.
operandKindER
// operandKindImm12 represents "Immediate 12" (Imm12). This is a 12-bit immediate value which can be either shifted by 12 or not.
// See asImm12 function for detail.
operandKindImm12
// operandKindShiftImm represents "Shifted Immediate" (ShiftImm) used by shift operations.
operandKindShiftImm
)
// format returns the string representation of this operand for debugging.
func (o operand) format(size byte) string {
switch o.kind {
case operandKindNR:
return formatVRegSized(o.nr(), size)
case operandKindSR:
r, amt, sop := o.sr()
return fmt.Sprintf("%s, %s #%d", formatVRegSized(r, size), sop, amt)
case operandKindER:
r, eop, _ := o.er()
return fmt.Sprintf("%s %s", formatVRegSized(r, size), eop)
case operandKindImm12:
imm12, shiftBit := o.imm12()
if shiftBit == 1 {
return fmt.Sprintf("#%#x", uint64(imm12)<<12)
} else {
return fmt.Sprintf("#%#x", imm12)
}
default:
panic(fmt.Sprintf("unknown operand kind: %d", o.kind))
}
}
// operandNR encodes the given VReg as an operand of operandKindNR.
func operandNR(r regalloc.VReg) operand {
return operand{kind: operandKindNR, data: uint64(r)}
}
// nr decodes the underlying VReg assuming the operand is of operandKindNR.
func (o operand) nr() regalloc.VReg {
return regalloc.VReg(o.data)
}
// operandER encodes the given VReg as an operand of operandKindER.
func operandER(r regalloc.VReg, eop extendOp, to byte) operand {
if to < 32 {
panic("TODO?BUG?: when we need to extend to less than 32 bits?")
}
return operand{kind: operandKindER, data: uint64(r), data2: uint64(eop)<<32 | uint64(to)}
}
// er decodes the underlying VReg, extend operation, and the target size assuming the operand is of operandKindER.
func (o operand) er() (r regalloc.VReg, eop extendOp, to byte) {
return regalloc.VReg(o.data), extendOp(o.data2>>32) & 0xff, byte(o.data2 & 0xff)
}
// operandSR encodes the given VReg as an operand of operandKindSR.
func operandSR(r regalloc.VReg, amt byte, sop shiftOp) operand {
return operand{kind: operandKindSR, data: uint64(r), data2: uint64(amt)<<32 | uint64(sop)}
}
// sr decodes the underlying VReg, shift amount, and shift operation assuming the operand is of operandKindSR.
func (o operand) sr() (r regalloc.VReg, amt byte, sop shiftOp) {
return regalloc.VReg(o.data), byte(o.data2>>32) & 0xff, shiftOp(o.data2) & 0xff
}
// operandImm12 encodes the given imm12 as an operand of operandKindImm12.
func operandImm12(imm12 uint16, shiftBit byte) operand {
return operand{kind: operandKindImm12, data: uint64(imm12) | uint64(shiftBit)<<32}
}
// imm12 decodes the underlying imm12 data assuming the operand is of operandKindImm12.
func (o operand) imm12() (v uint16, shiftBit byte) {
return uint16(o.data), byte(o.data >> 32)
}
// operandShiftImm encodes the given amount as an operand of operandKindShiftImm.
func operandShiftImm(amount byte) operand {
return operand{kind: operandKindShiftImm, data: uint64(amount)}
}
// shiftImm decodes the underlying shift amount data assuming the operand is of operandKindShiftImm.
func (o operand) shiftImm() byte {
return byte(o.data)
}
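// Illustrative sketch (not part of this change): operands are bit-packed into the data/data2
// fields, so each constructor above pairs with a decoder that recovers the original values.
// The hypothetical function below only demonstrates two of those round-trips.
func exampleOperandRoundTrip(r regalloc.VReg) {
if v, shift := operandImm12(0xabc, 1).imm12(); v != 0xabc || shift != 1 {
panic("BUG in example: imm12 operand must round-trip")
}
if rr, amt, sop := operandSR(r, 3, shiftOpLSL).sr(); rr != r || amt != 3 || sop != shiftOpLSL {
panic("BUG in example: shifted-register operand must round-trip")
}
}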
// reg returns the register of the operand if applicable.
func (o operand) reg() regalloc.VReg {
switch o.kind {
case operandKindNR:
return o.nr()
case operandKindSR:
r, _, _ := o.sr()
return r
case operandKindER:
r, _, _ := o.er()
return r
case operandKindImm12:
// Does not have a register.
case operandKindShiftImm:
// Does not have a register.
default:
panic(o.kind)
}
return regalloc.VRegInvalid
}
func (o operand) realReg() regalloc.RealReg {
return o.nr().RealReg()
}
func (o operand) assignReg(v regalloc.VReg) operand {
switch o.kind {
case operandKindNR:
return operandNR(v)
case operandKindSR:
_, amt, sop := o.sr()
return operandSR(v, amt, sop)
case operandKindER:
_, eop, to := o.er()
return operandER(v, eop, to)
case operandKindImm12:
// Does not have a register.
case operandKindShiftImm:
// Does not have a register.
}
panic(o.kind)
}
// getOperand_Imm12_ER_SR_NR returns an operand of either operandKindImm12, operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
// If the operand can be expressed as operandKindImm12, `mode` is ignored.
func (m *machine) getOperand_Imm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
instr := def.Instr
if instr.Opcode() == ssa.OpcodeIconst {
if imm12Op, ok := asImm12Operand(instr.ConstantVal()); ok {
instr.MarkLowered()
return imm12Op
}
}
return m.getOperand_ER_SR_NR(def, mode)
}
// getOperand_MaybeNegatedImm12_ER_SR_NR is almost the same as getOperand_Imm12_ER_SR_NR, but this might negate the immediate value.
// If the immediate value is negated, the second return value is true, otherwise always false.
func (m *machine) getOperand_MaybeNegatedImm12_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand, negatedImm12 bool) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg), false
}
instr := def.Instr
if instr.Opcode() == ssa.OpcodeIconst {
c := instr.ConstantVal()
if imm12Op, ok := asImm12Operand(c); ok {
instr.MarkLowered()
return imm12Op, false
}
signExtended := int64(c)
if def.SSAValue().Type().Bits() == 32 {
signExtended = (signExtended << 32) >> 32
}
negatedWithoutSign := -signExtended
if imm12Op, ok := asImm12Operand(uint64(negatedWithoutSign)); ok {
instr.MarkLowered()
return imm12Op, true
}
}
return m.getOperand_ER_SR_NR(def, mode), false
}
// getOperand_ER_SR_NR returns an operand of either operandKindER, operandKindSR, or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
func (m *machine) getOperand_ER_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
if m.compiler.MatchInstr(def, ssa.OpcodeSExtend) || m.compiler.MatchInstr(def, ssa.OpcodeUExtend) {
extInstr := def.Instr
signed := extInstr.Opcode() == ssa.OpcodeSExtend
innerExtFromBits, innerExtToBits := extInstr.ExtendFromToBits()
modeBits, modeSigned := mode.bits(), mode.signed()
if mode == extModeNone || innerExtToBits == modeBits {
eop := extendOpFrom(signed, innerExtFromBits)
extArg := m.getOperand_NR(m.compiler.ValueDefinition(extInstr.Arg()), extModeNone)
op = operandER(extArg.nr(), eop, innerExtToBits)
extInstr.MarkLowered()
return
}
if innerExtToBits > modeBits {
panic("BUG?TODO?: need the results of inner extension to be larger than the mode")
}
switch {
case (!signed && !modeSigned) || (signed && modeSigned):
// Two sign/zero extensions are equivalent to one sign/zero extension for the larger size.
eop := extendOpFrom(modeSigned, innerExtFromBits)
op = operandER(m.compiler.VRegOf(extInstr.Arg()), eop, modeBits)
extInstr.MarkLowered()
case (signed && !modeSigned) || (!signed && modeSigned):
// We need to {sign, zero}-extend the result of the {zero,sign} extension.
eop := extendOpFrom(modeSigned, innerExtToBits)
op = operandER(m.compiler.VRegOf(extInstr.Return()), eop, modeBits)
// Note that we failed to merge the inner extension instruction this case.
}
return
}
return m.getOperand_SR_NR(def, mode)
}
// getOperand_SR_NR returns an operand of either operandKindSR or operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
func (m *machine) getOperand_SR_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
if m.compiler.MatchInstr(def, ssa.OpcodeIshl) {
// Check if the shift amount is constant instruction.
targetVal, amountVal := def.Instr.Arg2()
targetVReg := m.getOperand_NR(m.compiler.ValueDefinition(targetVal), extModeNone).nr()
amountDef := m.compiler.ValueDefinition(amountVal)
if amountDef.IsFromInstr() && amountDef.Instr.Constant() {
// If that is the case, we can use the shifted register operand (SR).
c := byte(amountDef.Instr.ConstantVal()) & (targetVal.Type().Bits() - 1) // Clears the unnecessary bits.
def.Instr.MarkLowered()
amountDef.Instr.MarkLowered()
return operandSR(targetVReg, c, shiftOpLSL)
}
}
return m.getOperand_NR(def, mode)
}
// getOperand_ShiftImm_NR returns an operand of either operandKindShiftImm or operandKindNR from the given value (defined by `def`).
func (m *machine) getOperand_ShiftImm_NR(def *backend.SSAValueDefinition, mode extMode, shiftBitWidth byte) (op operand) {
if def.IsFromBlockParam() {
return operandNR(def.BlkParamVReg)
}
instr := def.Instr
if instr.Constant() {
amount := byte(instr.ConstantVal()) & (shiftBitWidth - 1) // Clears the unnecessary bits.
return operandShiftImm(amount)
}
return m.getOperand_NR(def, mode)
}
// getOperand_NR returns an operand of operandKindNR from the given value (defined by `def`).
//
// `mode` is used to extend the operand if the bit length is smaller than mode.bits().
func (m *machine) getOperand_NR(def *backend.SSAValueDefinition, mode extMode) (op operand) {
var v regalloc.VReg
if def.IsFromBlockParam() {
v = def.BlkParamVReg
} else {
instr := def.Instr
if instr.Constant() {
// We inline all the constant instructions so that we could reduce the register usage.
v = m.lowerConstant(instr)
instr.MarkLowered()
} else {
if n := def.N; n == 0 {
v = m.compiler.VRegOf(instr.Return())
} else {
_, rs := instr.Returns()
v = m.compiler.VRegOf(rs[n-1])
}
}
}
r := v
switch inBits := def.SSAValue().Type().Bits(); {
case mode == extModeNone:
case inBits == 32 && (mode == extModeZeroExtend32 || mode == extModeSignExtend32):
case inBits == 32 && mode == extModeZeroExtend64:
extended := m.compiler.AllocateVReg(ssa.TypeI64)
ext := m.allocateInstr()
ext.asExtend(extended, v, 32, 64, false)
m.insert(ext)
r = extended
case inBits == 32 && mode == extModeSignExtend64:
extended := m.compiler.AllocateVReg(ssa.TypeI64)
ext := m.allocateInstr()
ext.asExtend(extended, v, 32, 64, true)
m.insert(ext)
r = extended
case inBits == 64 && (mode == extModeZeroExtend64 || mode == extModeSignExtend64):
}
return operandNR(r)
}
func asImm12Operand(val uint64) (op operand, ok bool) {
v, shiftBit, ok := asImm12(val)
if !ok {
return operand{}, false
}
return operandImm12(v, shiftBit), true
}
func asImm12(val uint64) (v uint16, shiftBit byte, ok bool) {
const mask1, mask2 uint64 = 0xfff, 0xfff_000
if val&^mask1 == 0 {
return uint16(val), 0, true
} else if val&^mask2 == 0 {
return uint16(val >> 12), 1, true
} else {
return 0, 0, false
}
}
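// Illustrative sketch (not part of this change): asImm12 accepts values that fit in the low
// 12 bits, or in the low 12 bits shifted left by 12; anything else is rejected.
func exampleAsImm12() {
_, _, _ = asImm12(0xfff) // (0xfff, shiftBit=0, ok=true)
_, _, _ = asImm12(0x1000) // (0x1, shiftBit=1, ok=true): encoded as 0x1 << 12.
_, _, _ = asImm12(0x1001) // (0, 0, ok=false): needs both the low and the shifted field.
}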

View File

@ -0,0 +1,440 @@
package arm64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type (
// addressMode represents an ARM64 addressing mode.
//
// https://developer.arm.com/documentation/102374/0101/Loads-and-stores---addressing
// TODO: use the bit-packed layout like operand struct.
addressMode struct {
kind addressModeKind
rn, rm regalloc.VReg
extOp extendOp
imm int64
}
// addressModeKind represents the kind of ARM64 addressing mode.
addressModeKind byte
)
const (
// addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended,
// and then scaled by bits(type)/8.
//
// e.g.
// - ldrh w1, [x2, w3, SXTW #1] ;; sign-extended and scaled by 2 (== LSL #1)
// - strh w1, [x2, w3, UXTW #1] ;; zero-extended and scaled by 2 (== LSL #1)
// - ldr w1, [x2, w3, SXTW #2] ;; sign-extended and scaled by 4 (== LSL #2)
// - str x1, [x2, w3, UXTW #3] ;; zero-extended and scaled by 8 (== LSL #3)
//
// See the following pages:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--register---Load-Register-Halfword--register--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--
addressModeKindRegScaledExtended addressModeKind = iota
// addressModeKindRegScaled is the same as addressModeKindRegScaledExtended, but without the extension.
addressModeKindRegScaled
// addressModeKindRegExtended is the same as addressModeKindRegScaledExtended, but without the scale factor.
addressModeKindRegExtended
// addressModeKindRegReg takes a base register and an index register. The index register is neither scaled nor extended.
addressModeKindRegReg
// addressModeKindRegSignedImm9 takes a base register and a 9-bit "signed" immediate offset (-256 to 255).
// The immediate will be sign-extended, and be added to the base register.
// This is a.k.a. "unscaled" since the immediate is not scaled.
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--
addressModeKindRegSignedImm9
// addressModeKindRegUnsignedImm12 takes a base register and a 12-bit "unsigned" immediate offset, scaled by
// the size of the type. In other words, the actual offset will be imm12 * bits(type)/8.
// See "Unsigned offset" in the following pages:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
addressModeKindRegUnsignedImm12
// addressModeKindPostIndex takes a base register and a 9-bit "signed" immediate offset.
// After the load/store, the base register will be updated by the offset.
//
// Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
//
// See "Post-index" in the following pages for examples:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
addressModeKindPostIndex
// addressModeKindPreIndex takes a base register and a 9-bit "signed" immediate offset.
// Before the load/store, the base register will be updated by the offset.
//
// Note that when this is used for pair load/store, the offset will be 7-bit "signed" immediate offset.
//
// See "Pre-index" in the following pages for examples:
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRB--immediate---Load-Register-Byte--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-
addressModeKindPreIndex
// addressModeKindArgStackSpace is used to resolve the address of the argument stack space
// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
// at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above.
addressModeKindArgStackSpace
// addressModeKindResultStackSpace is used to resolve the address of the result stack space
// existing right above the stack pointer. Since we don't know the exact stack space needed for a function
// at a compilation phase, this is used as a placeholder and further lowered to a real addressing mode like above.
addressModeKindResultStackSpace
)
func (a addressMode) format(dstSizeBits byte) (ret string) {
base := formatVRegSized(a.rn, 64)
if rn := a.rn; rn.RegType() != regalloc.RegTypeInt {
panic("invalid base register type: " + a.rn.RegType().String())
} else if rn.IsRealReg() && v0 <= a.rn.RealReg() && a.rn.RealReg() <= v30 {
panic("BUG: likely a bug in reg alloc or reset behavior")
}
switch a.kind {
case addressModeKindRegScaledExtended:
amount := a.sizeInBitsToShiftAmount(dstSizeBits)
ret = fmt.Sprintf("[%s, %s, %s #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp, amount)
case addressModeKindRegScaled:
amount := a.sizeInBitsToShiftAmount(dstSizeBits)
ret = fmt.Sprintf("[%s, %s, lsl #%#x]", base, formatVRegSized(a.rm, a.indexRegBits()), amount)
case addressModeKindRegExtended:
ret = fmt.Sprintf("[%s, %s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()), a.extOp)
case addressModeKindRegReg:
ret = fmt.Sprintf("[%s, %s]", base, formatVRegSized(a.rm, a.indexRegBits()))
case addressModeKindRegSignedImm9:
if a.imm != 0 {
ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
} else {
ret = fmt.Sprintf("[%s]", base)
}
case addressModeKindRegUnsignedImm12:
if a.imm != 0 {
ret = fmt.Sprintf("[%s, #%#x]", base, a.imm)
} else {
ret = fmt.Sprintf("[%s]", base)
}
case addressModeKindPostIndex:
ret = fmt.Sprintf("[%s], #%#x", base, a.imm)
case addressModeKindPreIndex:
ret = fmt.Sprintf("[%s, #%#x]!", base, a.imm)
case addressModeKindArgStackSpace:
ret = fmt.Sprintf("[#arg_space, #%#x]", a.imm)
case addressModeKindResultStackSpace:
ret = fmt.Sprintf("[#ret_space, #%#x]", a.imm)
}
return
}
func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
}
if preIndex {
return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
} else {
return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
}
}
func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
divisor := int64(dstSizeInBits) / 8
return 0 < offset && offset%divisor == 0 && offset/divisor < 4096
}
func offsetFitsInAddressModeKindRegSignedImm9(offset int64) bool {
return -256 <= offset && offset <= 255
}
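// Illustrative sketch (not part of this change): the unsigned-imm12 form requires a positive
// offset that is a multiple of the access size and whose scaled value fits in 12 bits, while
// the signed-imm9 form only requires -256 <= offset <= 255.
func exampleOffsetFits() {
_ = offsetFitsInAddressModeKindRegUnsignedImm12(64, 8) // true: 8/8 = 1 < 4096.
_ = offsetFitsInAddressModeKindRegUnsignedImm12(64, 12) // false: 12 is not a multiple of 8.
_ = offsetFitsInAddressModeKindRegUnsignedImm12(32, 16384) // false: 16384/4 = 4096 is out of range.
_ = offsetFitsInAddressModeKindRegSignedImm9(-256) // true: lowest representable offset.
_ = offsetFitsInAddressModeKindRegSignedImm9(256) // false: just above the representable range.
}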
func (a addressMode) indexRegBits() byte {
bits := a.extOp.srcBits()
if bits != 32 && bits != 64 {
panic("invalid index register for address mode. it must be either 32 or 64 bits")
}
return bits
}
func (a addressMode) sizeInBitsToShiftAmount(sizeInBits byte) (lsl byte) {
switch sizeInBits {
case 8:
lsl = 0
case 16:
lsl = 1
case 32:
lsl = 2
case 64:
lsl = 3
}
return
}
func extLoadSignSize(op ssa.Opcode) (size byte, signed bool) {
switch op {
case ssa.OpcodeUload8:
size, signed = 8, false
case ssa.OpcodeUload16:
size, signed = 16, false
case ssa.OpcodeUload32:
size, signed = 32, false
case ssa.OpcodeSload8:
size, signed = 8, true
case ssa.OpcodeSload16:
size, signed = 16, true
case ssa.OpcodeSload32:
size, signed = 32, true
default:
panic("BUG")
}
return
}
func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret regalloc.VReg) {
size, signed := extLoadSignSize(op)
amode := m.lowerToAddressMode(ptr, offset, size)
load := m.allocateInstr()
if signed {
load.asSLoad(operandNR(ret), amode, size)
} else {
load.asULoad(operandNR(ret), amode, size)
}
m.insert(load)
}
func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.Value) {
amode := m.lowerToAddressMode(ptr, offset, typ.Bits())
dst := m.compiler.VRegOf(ret)
load := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
load.asULoad(operandNR(dst), amode, typ.Bits())
case ssa.TypeF32, ssa.TypeF64:
load.asFpuLoad(operandNR(dst), amode, typ.Bits())
case ssa.TypeV128:
load.asFpuLoad(operandNR(dst), amode, 128)
default:
panic("TODO")
}
m.insert(load)
}
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane, ret ssa.Value) {
// vecLoad1R supports an offset (base+imm) only in the post-index addressing mode, so we simply add the offset to the base here.
base := m.getOperand_NR(m.compiler.ValueDefinition(ptr), extModeNone).nr()
offsetReg := m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(offsetReg, int64(offset))
addedBase := m.addReg64ToReg64(base, offsetReg)
rd := operandNR(m.compiler.VRegOf(ret))
ld1r := m.allocateInstr()
ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
m.insert(ld1r)
}
func (m *machine) lowerStore(si *ssa.Instruction) {
// TODO: merge consecutive stores into a single pair store instruction.
value, ptr, offset, storeSizeInBits := si.StoreData()
amode := m.lowerToAddressMode(ptr, offset, storeSizeInBits)
valueOp := m.getOperand_NR(m.compiler.ValueDefinition(value), extModeNone)
store := m.allocateInstr()
store.asStore(valueOp, amode, storeSizeInBits)
m.insert(store)
}
// lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
// to support more efficient address resolution.
a32s, a64s, offset := m.collectAddends(ptr)
offset += int64(offsetBase)
return m.lowerToAddressModeFromAddends(a32s, a64s, size, offset)
}
// lowerToAddressModeFromAddends creates an addressMode from a list of addends collected by collectAddends.
// During the construction, this might emit additional instructions.
//
// Extracted as a separate function for easy testing.
func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) {
switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); {
case a64sExist && a32sExist:
base := a64s.Dequeue()
a32 := a32s.Dequeue()
amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext}
case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset):
base := a64s.Dequeue()
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset}
offset = 0
case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset):
base := a64s.Dequeue()
amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset}
offset = 0
case a64sExist:
base := a64s.Dequeue()
if !a64s.Empty() {
index := a64s.Dequeue()
amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */}
} else {
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
}
case a32sExist:
base32 := a32s.Dequeue()
// First we need 64-bit base.
base := m.compiler.AllocateVReg(ssa.TypeI64)
baseExt := m.allocateInstr()
var signed bool
if base32.ext == extendOpSXTW {
signed = true
}
baseExt.asExtend(base, base32.r, 32, 64, signed)
m.insert(baseExt)
if !a32s.Empty() {
index := a32s.Dequeue()
amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext}
} else {
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0}
}
default: // Only static offsets.
tmpReg := m.compiler.AllocateVReg(ssa.TypeI64)
m.lowerConstantI64(tmpReg, offset)
amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0}
offset = 0
}
baseReg := amode.rn
if offset > 0 {
baseReg = m.addConstToReg64(baseReg, offset) // baseReg += offset
}
for !a64s.Empty() {
a64 := a64s.Dequeue()
baseReg = m.addReg64ToReg64(baseReg, a64) // baseReg += a64
}
for !a32s.Empty() {
a32 := a32s.Dequeue()
baseReg = m.addRegToReg64Ext(baseReg, a32.r, a32.ext) // baseReg += (a32 extended to 64-bit)
}
amode.rn = baseReg
return
}
var addendsMatchOpcodes = [4]ssa.Opcode{ssa.OpcodeUExtend, ssa.OpcodeSExtend, ssa.OpcodeIadd, ssa.OpcodeIconst}
func (m *machine) collectAddends(ptr ssa.Value) (addends32 *wazevoapi.Queue[addend32], addends64 *wazevoapi.Queue[regalloc.VReg], offset int64) {
m.addendsWorkQueue.Reset()
m.addends32.Reset()
m.addends64.Reset()
m.addendsWorkQueue.Enqueue(ptr)
for !m.addendsWorkQueue.Empty() {
v := m.addendsWorkQueue.Dequeue()
def := m.compiler.ValueDefinition(v)
switch op := m.compiler.MatchInstrOneOf(def, addendsMatchOpcodes[:]); op {
case ssa.OpcodeIadd:
// If the addend is an add, we recursively collect its operands.
x, y := def.Instr.Arg2()
m.addendsWorkQueue.Enqueue(x)
m.addendsWorkQueue.Enqueue(y)
def.Instr.MarkLowered()
case ssa.OpcodeIconst:
// If the addend is constant, we just statically merge it into the offset.
ic := def.Instr
u64 := ic.ConstantVal()
if ic.Return().Type().Bits() == 32 {
offset += int64(int32(u64)) // sign-extend.
} else {
offset += int64(u64)
}
def.Instr.MarkLowered()
case ssa.OpcodeUExtend, ssa.OpcodeSExtend:
input := def.Instr.Arg()
if input.Type().Bits() != 32 {
panic("illegal size: " + input.Type().String())
}
var ext extendOp
if op == ssa.OpcodeUExtend {
ext = extendOpUXTW
} else {
ext = extendOpSXTW
}
inputDef := m.compiler.ValueDefinition(input)
constInst := inputDef.IsFromInstr() && inputDef.Instr.Constant()
switch {
case constInst && ext == extendOpUXTW:
// Zero-extension of a 32-bit constant can be merged into the offset.
offset += int64(uint32(inputDef.Instr.ConstantVal()))
case constInst && ext == extendOpSXTW:
// Sign-extension of a 32-bit constant can be merged into the offset.
offset += int64(int32(inputDef.Instr.ConstantVal())) // sign-extend!
default:
m.addends32.Enqueue(addend32{r: m.getOperand_NR(inputDef, extModeNone).nr(), ext: ext})
}
def.Instr.MarkLowered()
continue
default:
// If the addend is not one of them, we simply use it as-is (without merging!), optionally zero-extending it.
m.addends64.Enqueue(m.getOperand_NR(def, extModeZeroExtend64 /* optional zero ext */).nr())
}
}
return &m.addends32, &m.addends64, offset
}
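// Illustrative sketch (not part of this change): constant addends are folded into the static
// offset, and 32-bit constants are sign- or zero-extended depending on how they reach the
// address computation, mirroring the iconst/extend cases above. The helper name is made up.
func exampleFoldConstAddend(offset int64, constVal uint64, is32bit, signed bool) int64 {
switch {
case is32bit && signed:
return offset + int64(int32(constVal)) // e.g. 0xffff_ffff folds as -1.
case is32bit:
return offset + int64(uint32(constVal)) // e.g. 0xffff_ffff folds as 4294967295.
default:
return offset + int64(constVal)
}
}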
func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
if imm12Op, ok := asImm12Operand(uint64(c)); ok {
alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true)
} else if imm12Op, ok = asImm12Operand(uint64(-c)); ok {
alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true)
} else {
tmp := m.compiler.AllocateVReg(ssa.TypeI64)
m.load64bitConst(c, tmp)
alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true)
}
m.insert(alu)
return
}
func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true)
m.insert(alu)
return
}
func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) {
rd = m.compiler.AllocateVReg(ssa.TypeI64)
alu := m.allocateInstr()
alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true)
m.insert(alu)
return
}

View File

@ -0,0 +1,515 @@
package arm64
import (
"context"
"fmt"
"strings"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type (
// machine implements backend.Machine.
machine struct {
compiler backend.Compiler
executableContext *backend.ExecutableContextT[instruction]
currentABI *backend.FunctionABI
regAlloc regalloc.Allocator
regAllocFn *backend.RegAllocFunction[*instruction, *machine]
// addendsWorkQueue is used during address lowering, defined here for reuse.
addendsWorkQueue wazevoapi.Queue[ssa.Value]
addends32 wazevoapi.Queue[addend32]
// addends64 is used during address lowering, defined here for reuse.
addends64 wazevoapi.Queue[regalloc.VReg]
unresolvedAddressModes []*instruction
// condBrRelocs holds the conditional branches which need offset relocation.
condBrRelocs []condBrReloc
// jmpTableTargets holds the labels of the jump table targets.
jmpTableTargets [][]uint32
// spillSlotSize is the size of the stack slot in bytes used for spilling registers.
// During the execution of the function, the stack looks like:
//
//
// (high address)
// +-----------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | xxxxx |
// | ReturnAddress |
// +-----------------+ <<-|
// | ........... | |
// | spill slot M | | <--- spillSlotSize
// | ............ | |
// | spill slot 2 | |
// | spill slot 1 | <<-+
// | clobbered N |
// | ........... |
// | clobbered 1 |
// | clobbered 0 |
// SP---> +-----------------+
// (low address)
//
// and it represents the size of the space between FP and the first spilled slot. This must be a multiple of 16.
// Also note that this is only known after register allocation.
spillSlotSize int64
spillSlots map[regalloc.VRegID]int64 // regalloc.VRegID to offset.
// clobberedRegs holds real-register backed VRegs saved at the function prologue, and restored at the epilogue.
clobberedRegs []regalloc.VReg
maxRequiredStackSizeForCalls int64
stackBoundsCheckDisabled bool
regAllocStarted bool
}
addend32 struct {
r regalloc.VReg
ext extendOp
}
condBrReloc struct {
cbr *instruction
// currentLabelPos is the labelPosition within which condBr is defined.
currentLabelPos *labelPosition
// Next block's labelPosition.
nextLabel label
offset int64
}
labelPosition = backend.LabelPosition[instruction]
label = backend.Label
)
const (
labelReturn = backend.LabelReturn
labelInvalid = backend.LabelInvalid
)
// NewBackend returns a new backend for arm64.
func NewBackend() backend.Machine {
m := &machine{
spillSlots: make(map[regalloc.VRegID]int64),
executableContext: newExecutableContext(),
regAlloc: regalloc.NewAllocator(regInfo),
}
return m
}
func newExecutableContext() *backend.ExecutableContextT[instruction] {
return backend.NewExecutableContextT[instruction](resetInstruction, setNext, setPrev, asNop0)
}
// ExecutableContext implements backend.Machine.
func (m *machine) ExecutableContext() backend.ExecutableContext {
return m.executableContext
}
// RegAlloc implements backend.Machine RegAlloc.
func (m *machine) RegAlloc() {
rf := m.regAllocFn
for _, pos := range m.executableContext.OrderedBlockLabels {
rf.AddBlock(pos.SB, pos.L, pos.Begin, pos.End)
}
m.regAllocStarted = true
m.regAlloc.DoAllocation(rf)
// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
}
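// Illustrative sketch (not part of this change): the expression above rounds spillSlotSize up
// to the next multiple of 16, e.g. 0 -> 0, 8 -> 16, 24 -> 32.
func exampleAlignTo16(size int64) int64 {
return (size + 15) &^ 15
}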
// Reset implements backend.Machine.
func (m *machine) Reset() {
// Reuse clobberedRegs as scratch space: collect the spillSlots keys into it, delete them from the map, then truncate it again.
m.clobberedRegs = m.clobberedRegs[:0]
for key := range m.spillSlots {
m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
}
for _, key := range m.clobberedRegs {
delete(m.spillSlots, regalloc.VRegID(key))
}
m.clobberedRegs = m.clobberedRegs[:0]
m.regAllocStarted = false
m.regAlloc.Reset()
m.regAllocFn.Reset()
m.spillSlotSize = 0
m.unresolvedAddressModes = m.unresolvedAddressModes[:0]
m.maxRequiredStackSizeForCalls = 0
m.executableContext.Reset()
m.jmpTableTargets = m.jmpTableTargets[:0]
}
// SetCurrentABI implements backend.Machine SetCurrentABI.
func (m *machine) SetCurrentABI(abi *backend.FunctionABI) {
m.currentABI = abi
}
// DisableStackCheck implements backend.Machine DisableStackCheck.
func (m *machine) DisableStackCheck() {
m.stackBoundsCheckDisabled = true
}
// SetCompiler implements backend.Machine.
func (m *machine) SetCompiler(ctx backend.Compiler) {
m.compiler = ctx
m.regAllocFn = backend.NewRegAllocFunction[*instruction, *machine](m, ctx.SSABuilder(), ctx)
}
func (m *machine) insert(i *instruction) {
ectx := m.executableContext
ectx.PendingInstructions = append(ectx.PendingInstructions, i)
}
func (m *machine) insertBrTargetLabel() label {
nop, l := m.allocateBrTarget()
m.insert(nop)
return l
}
func (m *machine) allocateBrTarget() (nop *instruction, l label) {
ectx := m.executableContext
l = ectx.AllocateLabel()
nop = m.allocateInstr()
nop.asNop0WithLabel(l)
pos := ectx.AllocateLabelPosition(l)
pos.Begin, pos.End = nop, nop
ectx.LabelPositions[l] = pos
return
}
// allocateInstr allocates an instruction.
func (m *machine) allocateInstr() *instruction {
instr := m.executableContext.InstructionPool.Allocate()
if !m.regAllocStarted {
instr.addedBeforeRegAlloc = true
}
return instr
}
func resetInstruction(i *instruction) {
*i = instruction{}
}
func (m *machine) allocateNop() *instruction {
instr := m.allocateInstr()
instr.asNop0()
return instr
}
func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) {
amode := &i.amode
switch amode.kind {
case addressModeKindResultStackSpace:
amode.imm += ret0offset
case addressModeKindArgStackSpace:
amode.imm += arg0offset
default:
panic("BUG")
}
var sizeInBits byte
switch i.kind {
case store8, uLoad8:
sizeInBits = 8
case store16, uLoad16:
sizeInBits = 16
case store32, fpuStore32, uLoad32, fpuLoad32:
sizeInBits = 32
case store64, fpuStore64, uLoad64, fpuLoad64:
sizeInBits = 64
case fpuStore128, fpuLoad128:
sizeInBits = 128
default:
panic("BUG")
}
if offsetFitsInAddressModeKindRegUnsignedImm12(sizeInBits, amode.imm) {
amode.kind = addressModeKindRegUnsignedImm12
} else {
// In this case, we load the offset into the temporary register
// and then use it as the index register.
newPrev := m.lowerConstantI64AndInsert(i.prev, tmpRegVReg, amode.imm)
linkInstr(newPrev, i)
*amode = addressMode{kind: addressModeKindRegReg, rn: amode.rn, rm: tmpRegVReg, extOp: extendOpUXTX /* indicates rm reg is 64-bit */}
}
}
// resolveRelativeAddresses resolves the relative addresses before encoding.
func (m *machine) resolveRelativeAddresses(ctx context.Context) {
ectx := m.executableContext
for {
if len(m.unresolvedAddressModes) > 0 {
arg0offset, ret0offset := m.arg0OffsetFromSP(), m.ret0OffsetFromSP()
for _, i := range m.unresolvedAddressModes {
m.resolveAddressingMode(arg0offset, ret0offset, i)
}
}
// Reuse the slice to gather the unresolved conditional branches.
m.condBrRelocs = m.condBrRelocs[:0]
var fn string
var fnIndex int
var labelToSSABlockID map[label]ssa.BasicBlockID
if wazevoapi.PerfMapEnabled {
fn = wazevoapi.GetCurrentFunctionName(ctx)
labelToSSABlockID = make(map[label]ssa.BasicBlockID)
for i, l := range ectx.SsaBlockIDToLabels {
labelToSSABlockID[l] = ssa.BasicBlockID(i)
}
fnIndex = wazevoapi.GetCurrentFunctionIndex(ctx)
}
// Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label.
var offset int64
for i, pos := range ectx.OrderedBlockLabels {
pos.BinaryOffset = offset
var size int64
for cur := pos.Begin; ; cur = cur.next {
switch cur.kind {
case nop0:
l := cur.nop0Label()
if pos, ok := ectx.LabelPositions[l]; ok {
pos.BinaryOffset = offset + size
}
case condBr:
if !cur.condBrOffsetResolved() {
var nextLabel label
if i < len(ectx.OrderedBlockLabels)-1 {
// Note: this is only used when the block ends with fallthrough,
// so it can safely be assumed that the next block exists when it's needed.
nextLabel = ectx.OrderedBlockLabels[i+1].L
}
m.condBrRelocs = append(m.condBrRelocs, condBrReloc{
cbr: cur, currentLabelPos: pos, offset: offset + size,
nextLabel: nextLabel,
})
}
}
size += cur.size()
if cur == pos.End {
break
}
}
if wazevoapi.PerfMapEnabled {
if size > 0 {
l := pos.L
var labelStr string
if blkID, ok := labelToSSABlockID[l]; ok {
labelStr = fmt.Sprintf("%s::SSA_Block[%s]", l, blkID)
} else {
labelStr = l.String()
}
wazevoapi.PerfMap.AddModuleEntry(fnIndex, offset, uint64(size), fmt.Sprintf("%s:::::%s", fn, labelStr))
}
}
offset += size
}
// Before resolving any offsets, we need to check if all the conditional branches can be resolved.
var needRerun bool
for i := range m.condBrRelocs {
reloc := &m.condBrRelocs[i]
cbr := reloc.cbr
offset := reloc.offset
target := cbr.condBrLabel()
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
diff := offsetOfTarget - offset
if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
// In this case, the conditional branch target is too far away. We place trampoline instructions at the end of the current block,
// and make the conditional branch jump to them instead.
m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel)
// Then, we need to recall this function to fix up the label offsets
// as they have changed after the trampoline is inserted.
needRerun = true
}
}
if needRerun {
if wazevoapi.PerfMapEnabled {
wazevoapi.PerfMap.Clear()
}
} else {
break
}
}
var currentOffset int64
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
switch cur.kind {
case br:
target := cur.brLabel()
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
diff := offsetOfTarget - currentOffset
divided := diff >> 2
if divided < minSignedInt26 || divided > maxSignedInt26 {
// This means the currently compiled single function is extremely large.
panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range")
}
cur.brOffsetResolve(diff)
case condBr:
if !cur.condBrOffsetResolved() {
target := cur.condBrLabel()
offsetOfTarget := ectx.LabelPositions[target].BinaryOffset
diff := offsetOfTarget - currentOffset
if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 {
panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly")
}
cur.condBrOffsetResolve(diff)
}
case brTableSequence:
tableIndex := cur.u1
targets := m.jmpTableTargets[tableIndex]
for i := range targets {
l := label(targets[i])
offsetOfTarget := ectx.LabelPositions[l].BinaryOffset
diff := offsetOfTarget - (currentOffset + brTableSequenceOffsetTableBegin)
targets[i] = uint32(diff)
}
cur.brTableSequenceOffsetsResolved()
case emitSourceOffsetInfo:
m.compiler.AddSourceOffsetInfo(currentOffset, cur.sourceOffsetInfo())
}
currentOffset += cur.size()
}
}
const (
maxSignedInt26 = 1<<25 - 1
minSignedInt26 = -(1 << 25)
maxSignedInt19 = 1<<18 - 1
minSignedInt19 = -(1 << 18)
)
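// Illustrative sketch (not part of this change): both resolution loops above divide the byte
// offset by 4 (every instruction is 4 bytes) and check the result against the signed 19-bit
// (conditional branch) or 26-bit (unconditional branch) range. The hypothetical helpers below
// only restate those two checks.
func exampleFitsInSignedInt19(diffInBytes int64) bool {
d := diffInBytes >> 2
return minSignedInt19 <= d && d <= maxSignedInt19
}
func exampleFitsInSignedInt26(diffInBytes int64) bool {
d := diffInBytes >> 2
return minSignedInt26 <= d && d <= maxSignedInt26
}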
func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) {
cur := currentBlk.End
originalTarget := cbr.condBrLabel()
endNext := cur.next
if cur.kind != br {
// If the current block ends with an unconditional branch (br), we could just insert the trampoline after it.
// Otherwise, we need to insert a "skip" branch instruction to jump over the trampoline instructions.
skip := m.allocateInstr()
skip.asBr(nextLabel)
cur = linkInstr(cur, skip)
}
cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget()
cbr.setCondBrTargets(cbrNewTargetLabel)
cur = linkInstr(cur, cbrNewTargetInstr)
// Then insert the unconditional branch to the original target, which should be encodable,
// as the 26-bit offset range is enough for any practical application.
br := m.allocateInstr()
br.asBr(originalTarget)
cur = linkInstr(cur, br)
// Update the end of the current block.
currentBlk.End = cur
linkInstr(cur, endNext)
}
// Format implements backend.Machine.
func (m *machine) Format() string {
ectx := m.executableContext
begins := map[*instruction]label{}
for l, pos := range ectx.LabelPositions {
begins[pos.Begin] = l
}
irBlocks := map[label]ssa.BasicBlockID{}
for i, l := range ectx.SsaBlockIDToLabels {
irBlocks[l] = ssa.BasicBlockID(i)
}
var lines []string
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
if l, ok := begins[cur]; ok {
var labelStr string
if blkID, ok := irBlocks[l]; ok {
labelStr = fmt.Sprintf("%s (SSA Block: %s):", l, blkID)
} else {
labelStr = fmt.Sprintf("%s:", l)
}
lines = append(lines, labelStr)
}
if cur.kind == nop0 {
continue
}
lines = append(lines, "\t"+cur.String())
}
return "\n" + strings.Join(lines, "\n") + "\n"
}
// InsertReturn implements backend.Machine.
func (m *machine) InsertReturn() {
i := m.allocateInstr()
i.asRet()
m.insert(i)
}
func (m *machine) getVRegSpillSlotOffsetFromSP(id regalloc.VRegID, size byte) int64 {
offset, ok := m.spillSlots[id]
if !ok {
offset = m.spillSlotSize
// TODO: this should be aligned depending on the `size` to use Imm12 offset load/store as much as possible.
m.spillSlots[id] = offset
m.spillSlotSize += int64(size)
}
return offset + 16 // spill slot starts above the clobbered registers and the frame size.
}
func (m *machine) clobberedRegSlotSize() int64 {
return int64(len(m.clobberedRegs) * 16)
}
func (m *machine) arg0OffsetFromSP() int64 {
return m.frameSize() +
16 + // 16-byte aligned return address
16 // frame size saved below the clobbered registers.
}
func (m *machine) ret0OffsetFromSP() int64 {
return m.arg0OffsetFromSP() + m.currentABI.ArgStackSize
}
func (m *machine) requiredStackSize() int64 {
return m.maxRequiredStackSizeForCalls +
m.frameSize() +
16 + // 16-byte aligned return address.
16 // frame size saved below the clobbered registers.
}
func (m *machine) frameSize() int64 {
s := m.clobberedRegSlotSize() + m.spillSlotSize
if s&0xf != 0 {
panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", s))
}
return s
}
func (m *machine) addJmpTableTarget(targets []ssa.BasicBlock) (index int) {
// TODO: reuse the slice!
labels := make([]uint32, len(targets))
for j, target := range targets {
labels[j] = uint32(m.executableContext.GetOrAllocateSSABlockLabel(target))
}
index = len(m.jmpTableTargets)
m.jmpTableTargets = append(m.jmpTableTargets, labels)
return
}

View File

@ -0,0 +1,469 @@
package arm64
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
// PostRegAlloc implements backend.Machine.
func (m *machine) PostRegAlloc() {
m.setupPrologue()
m.postRegAlloc()
}
// setupPrologue initializes the prologue of the function.
func (m *machine) setupPrologue() {
ectx := m.executableContext
cur := ectx.RootInstr
prevInitInst := cur.next
//
// (high address) (high address)
// SP----> +-----------------+ +------------------+ <----+
// | ....... | | ....... | |
// | ret Y | | ret Y | |
// | ....... | | ....... | |
// | ret 0 | | ret 0 | |
// | arg X | | arg X | | size_of_arg_ret.
// | ....... | ====> | ....... | |
// | arg 1 | | arg 1 | |
// | arg 0 | | arg 0 | <----+
// |-----------------| | size_of_arg_ret |
// | return address |
// +------------------+ <---- SP
// (low address) (low address)
// Saves the return address (lr) and the size_of_arg_ret below the SP.
// size_of_arg_ret is used for stack unwinding.
cur = m.createReturnAddrAndSizeOfArgRetSlot(cur)
if !m.stackBoundsCheckDisabled {
cur = m.insertStackBoundsCheck(m.requiredStackSize(), cur)
}
// Decrement SP if spillSlotSize > 0.
if m.spillSlotSize == 0 && len(m.spillSlots) != 0 {
panic(fmt.Sprintf("BUG: spillSlotSize=%d, spillSlots=%v\n", m.spillSlotSize, m.spillSlots))
}
if regs := m.clobberedRegs; len(regs) > 0 {
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | size_of_arg_ret | | size_of_arg_ret |
// | ReturnAddress | | ReturnAddress |
// SP----> +-----------------+ ====> +-----------------+
// (low address) | clobbered M |
// | ............ |
// | clobbered 0 |
// +-----------------+ <----- SP
// (low address)
//
_amode := addressModePreOrPostIndex(spVReg,
-16, // stack pointer must be 16-byte aligned.
true, // Decrement before store.
)
for _, vr := range regs {
// TODO: pair stores to reduce the number of instructions.
store := m.allocateInstr()
store.asStore(operandNR(vr), _amode, regTypeToRegisterSizeInBits(vr.RegType()))
cur = linkInstr(cur, store)
}
}
if size := m.spillSlotSize; size > 0 {
// Check if size is 16-byte aligned.
if size&0xf != 0 {
panic(fmt.Errorf("BUG: spill slot size %d is not 16-byte aligned", size))
}
cur = m.addsAddOrSubStackPointer(cur, spVReg, size, false)
// At this point, the stack looks like:
//
// (high address)
// +------------------+
// | ....... |
// | ret Y |
// | ....... |
// | ret 0 |
// | arg X |
// | ....... |
// | arg 1 |
// | arg 0 |
// | size_of_arg_ret |
// | ReturnAddress |
// +------------------+
// | clobbered M |
// | ............ |
// | clobbered 0 |
// | spill slot N |
// | ............ |
// | spill slot 2 |
// | spill slot 0 |
// SP----> +------------------+
// (low address)
}
// We push the frame size onto the stack to make stack unwinding possible:
//
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | size_of_arg_ret | | size_of_arg_ret |
// | ReturnAddress | | ReturnAddress |
// +-----------------+ ==> +-----------------+ <----+
// | clobbered M | | clobbered M | |
// | ............ | | ............ | |
// | clobbered 2 | | clobbered 2 | |
// | clobbered 1 | | clobbered 1 | | frame size
// | clobbered 0 | | clobbered 0 | |
// | spill slot N | | spill slot N | |
// | ............ | | ............ | |
// | spill slot 0 | | spill slot 0 | <----+
// SP---> +-----------------+ | xxxxxx | ;; unused space to make it 16-byte aligned.
// | frame_size |
// +-----------------+ <---- SP
// (low address)
//
cur = m.createFrameSizeSlot(cur, m.frameSize())
linkInstr(cur, prevInitInst)
}
func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruction {
// First we decrement the stack pointer so that it points to the arg0 slot.
var sizeOfArgRetReg regalloc.VReg
s := int64(m.currentABI.AlignedArgResultStackSlotSize())
if s > 0 {
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
sizeOfArgRetReg = tmpRegVReg
subSp := m.allocateInstr()
subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true)
cur = linkInstr(cur, subSp)
} else {
sizeOfArgRetReg = xzrVReg
}
// Saves the return address (lr) and the size_of_arg_ret below the SP.
// size_of_arg_ret is used for stack unwinding.
pstr := m.allocateInstr()
amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */)
pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode)
cur = linkInstr(cur, pstr)
return cur
}
func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction {
var frameSizeReg regalloc.VReg
if s > 0 {
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, s)
frameSizeReg = tmpRegVReg
} else {
frameSizeReg = xzrVReg
}
_amode := addressModePreOrPostIndex(spVReg,
-16, // stack pointer must be 16-byte aligned.
true, // Decrement before store.
)
store := m.allocateInstr()
store.asStore(operandNR(frameSizeReg), _amode, 64)
cur = linkInstr(cur, store)
return cur
}
// postRegAlloc does multiple things while walking through the instructions:
// 1. Removes the redundant copy instruction.
// 2. Inserts the epilogue.
func (m *machine) postRegAlloc() {
ectx := m.executableContext
for cur := ectx.RootInstr; cur != nil; cur = cur.next {
switch cur.kind {
case ret:
m.setupEpilogueAfter(cur.prev)
case loadConstBlockArg:
lc := cur
next := lc.next
m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
m.lowerLoadConstantBlockArgAfterRegAlloc(lc)
for _, instr := range m.executableContext.PendingInstructions {
cur = linkInstr(cur, instr)
}
linkInstr(cur, next)
m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0]
default:
// Removes the redundant copy instruction.
if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() {
prev, next := cur.prev, cur.next
// Remove the copy instruction.
prev.next = next
if next != nil {
next.prev = prev
}
}
}
}
}
func (m *machine) setupEpilogueAfter(cur *instruction) {
prevNext := cur.next
// We've stored the frame size in the prologue, and now that we are about to return from this function, we won't need it anymore.
cur = m.addsAddOrSubStackPointer(cur, spVReg, 16, true)
if s := m.spillSlotSize; s > 0 {
// Adjust SP to the original value:
//
// (high address) (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | | xxxxx |
// | ReturnAddress | | ReturnAddress |
// +-----------------+ ====> +-----------------+
// | clobbered M | | clobbered M |
// | ............ | | ............ |
// | clobbered 1 | | clobbered 1 |
// | clobbered 0 | | clobbered 0 |
// | spill slot N | +-----------------+ <---- SP
// | ............ |
// | spill slot 0 |
// SP---> +-----------------+
// (low address)
//
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
}
// First we need to restore the clobbered registers.
if len(m.clobberedRegs) > 0 {
// (high address)
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | | xxxxx |
// | ReturnAddress | | ReturnAddress |
// +-----------------+ ========> +-----------------+ <---- SP
// | clobbered M |
// | ........... |
// | clobbered 1 |
// | clobbered 0 |
// SP---> +-----------------+
// (low address)
l := len(m.clobberedRegs) - 1
for i := range m.clobberedRegs {
vr := m.clobberedRegs[l-i] // reverse order to restore.
load := m.allocateInstr()
amode := addressModePreOrPostIndex(spVReg,
16, // stack pointer must be 16-byte aligned.
false, // Increment after load.
)
// TODO: pair loads to reduce the number of instructions.
switch regTypeToRegisterSizeInBits(vr.RegType()) {
case 64: // restore int reg.
load.asULoad(operandNR(vr), amode, 64)
case 128: // restore vector reg.
load.asFpuLoad(operandNR(vr), amode, 128)
}
cur = linkInstr(cur, load)
}
}
// Reload the return address (lr).
//
// +-----------------+ +-----------------+
// | ....... | | ....... |
// | ret Y | | ret Y |
// | ....... | | ....... |
// | ret 0 | | ret 0 |
// | arg X | | arg X |
// | ....... | ===> | ....... |
// | arg 1 | | arg 1 |
// | arg 0 | | arg 0 |
// | xxxxx | +-----------------+ <---- SP
// | ReturnAddress |
// SP----> +-----------------+
ldr := m.allocateInstr()
ldr.asULoad(operandNR(lrVReg),
addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64)
cur = linkInstr(cur, ldr)
if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 {
cur = m.addsAddOrSubStackPointer(cur, spVReg, s, true)
}
linkInstr(cur, prevNext)
}
// saveRequiredRegs is the set of registers that must be saved/restored while growing the stack when there's insufficient
// stack space left. Basically this is the combination of CalleeSavedRegisters plus argument registers except for x0,
// which always points to the execution context whenever the native code is entered from Go.
var saveRequiredRegs = []regalloc.VReg{
x1VReg, x2VReg, x3VReg, x4VReg, x5VReg, x6VReg, x7VReg,
x19VReg, x20VReg, x21VReg, x22VReg, x23VReg, x24VReg, x25VReg, x26VReg, x28VReg, lrVReg,
v0VReg, v1VReg, v2VReg, v3VReg, v4VReg, v5VReg, v6VReg, v7VReg,
v18VReg, v19VReg, v20VReg, v21VReg, v22VReg, v23VReg, v24VReg, v25VReg, v26VReg, v27VReg, v28VReg, v29VReg, v30VReg, v31VReg,
}
// insertStackBoundsCheck will insert the instructions after `cur` to check the
// stack bounds, and if there is not enough stack space left for the function,
// exit the execution and try growing the stack in the Go world.
//
// TODO: we should be able to share the instructions across all the functions to reduce the size of compiled executable.
func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instruction) *instruction {
if requiredStackSize%16 != 0 {
panic("BUG")
}
if imm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok {
// sub tmp, sp, #requiredStackSize
sub := m.allocateInstr()
sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), imm12op, true)
cur = linkInstr(cur, sub)
} else {
// In this case, we first load the requiredStackSize into the temporary register,
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
// Then subtract it.
sub := m.allocateInstr()
sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true)
cur = linkInstr(cur, sub)
}
tmp2 := x11VReg // Caller save, so it is safe to use it here in the prologue.
// ldr tmp2, [executionContext #StackBottomPtr]
ldr := m.allocateInstr()
ldr.asULoad(operandNR(tmp2), addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: x0VReg, // execution context is always the first argument.
imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(),
}, 64)
cur = linkInstr(cur, ldr)
// subs xzr, tmp, tmp2
subs := m.allocateInstr()
subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true)
cur = linkInstr(cur, subs)
// b.ge #imm
cbr := m.allocateInstr()
cbr.asCondBr(ge.asCond(), labelInvalid, false /* ignored */)
cur = linkInstr(cur, cbr)
// Set the required stack size and set it to the exec context.
{
// First load the requiredStackSize into the temporary register,
cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize)
setRequiredStackSize := m.allocateInstr()
setRequiredStackSize.asStore(operandNR(tmpRegVReg),
addressMode{
kind: addressModeKindRegUnsignedImm12,
// Execution context is always the first argument.
rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
}, 64)
cur = linkInstr(cur, setRequiredStackSize)
}
ldrAddress := m.allocateInstr()
ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
kind: addressModeKindRegUnsignedImm12,
rn: x0VReg, // execution context is always the first argument
imm: wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
}, 64)
cur = linkInstr(cur, ldrAddress)
// Then jump to the stack grow call sequence's address,
// transferring control to the code compiled by CompileStackGrowCallSequence.
bl := m.allocateInstr()
bl.asCallIndirect(tmpRegVReg, nil)
cur = linkInstr(cur, bl)
// Now that we know the entire code, we can finalize how many bytes
// we have to skip when the stack size is sufficient.
var cbrOffset int64
for _cur := cbr; ; _cur = _cur.next {
cbrOffset += _cur.size()
if _cur == cur {
break
}
}
cbr.condBrOffsetResolve(cbrOffset)
return cur
}
// CompileStackGrowCallSequence implements backend.Machine.
func (m *machine) CompileStackGrowCallSequence() []byte {
ectx := m.executableContext
cur := m.allocateInstr()
cur.asNop0()
ectx.RootInstr = cur
// Save the callee saved and argument registers.
cur = m.saveRegistersInExecutionContext(cur, saveRequiredRegs)
// Save the current stack pointer.
cur = m.saveCurrentStackPointer(cur, x0VReg)
// Set the exit status on the execution context.
cur = m.setExitCode(cur, x0VReg, wazevoapi.ExitCodeGrowStack)
// Exit the execution.
cur = m.storeReturnAddressAndExit(cur)
// After the exit, restore the saved registers.
cur = m.restoreRegistersInExecutionContext(cur, saveRequiredRegs)
// Then go back to the original address of this stack grow call.
ret := m.allocateInstr()
ret.asRet()
linkInstr(cur, ret)
m.encode(ectx.RootInstr)
return m.compiler.Buf()
}
func (m *machine) addsAddOrSubStackPointer(cur *instruction, rd regalloc.VReg, diff int64, add bool) *instruction {
ectx := m.executableContext
ectx.PendingInstructions = ectx.PendingInstructions[:0]
m.insertAddOrSubStackPointer(rd, diff, add)
for _, inserted := range ectx.PendingInstructions {
cur = linkInstr(cur, inserted)
}
return cur
}

View File

@ -0,0 +1,152 @@
package arm64
// This file implements the interfaces required for register allocation. See backend.RegAllocFunctionMachine.
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// ClobberedRegisters implements backend.RegAllocFunctionMachine.
func (m *machine) ClobberedRegisters(regs []regalloc.VReg) {
m.clobberedRegs = append(m.clobberedRegs[:0], regs...)
}
// Swap implements backend.RegAllocFunctionMachine.
func (m *machine) Swap(cur *instruction, x1, x2, tmp regalloc.VReg) {
prevNext := cur.next
var mov1, mov2, mov3 *instruction
if x1.RegType() == regalloc.RegTypeInt {
if !tmp.Valid() {
tmp = tmpRegVReg
}
mov1 = m.allocateInstr().asMove64(tmp, x1)
mov2 = m.allocateInstr().asMove64(x1, x2)
mov3 = m.allocateInstr().asMove64(x2, tmp)
cur = linkInstr(cur, mov1)
cur = linkInstr(cur, mov2)
cur = linkInstr(cur, mov3)
linkInstr(cur, prevNext)
} else {
if !tmp.Valid() {
r2 := x2.RealReg()
// Temporarily spill x1 to stack.
cur = m.InsertStoreRegisterAt(x1, cur, true).prev
// Then move x2 to x1.
cur = linkInstr(cur, m.allocateInstr().asFpuMov128(x1, x2))
linkInstr(cur, prevNext)
// Then reload the original value of x1 from the stack into r2.
m.InsertReloadRegisterAt(x1.SetRealReg(r2), cur, true)
} else {
mov1 = m.allocateInstr().asFpuMov128(tmp, x1)
mov2 = m.allocateInstr().asFpuMov128(x1, x2)
mov3 = m.allocateInstr().asFpuMov128(x2, tmp)
cur = linkInstr(cur, mov1)
cur = linkInstr(cur, mov2)
cur = linkInstr(cur, mov3)
linkInstr(cur, prevNext)
}
}
}
// InsertMoveBefore implements backend.RegAllocFunctionMachine.
func (m *machine) InsertMoveBefore(dst, src regalloc.VReg, instr *instruction) {
typ := src.RegType()
if typ != dst.RegType() {
panic("BUG: src and dst must have the same type")
}
mov := m.allocateInstr()
if typ == regalloc.RegTypeInt {
mov.asMove64(dst, src)
} else {
mov.asFpuMov128(dst, src)
}
cur := instr.prev
prevNext := cur.next
cur = linkInstr(cur, mov)
linkInstr(cur, prevNext)
}
// SSABlockLabel implements backend.RegAllocFunctionMachine.
func (m *machine) SSABlockLabel(id ssa.BasicBlockID) backend.Label {
return m.executableContext.SsaBlockIDToLabels[id]
}
// InsertStoreRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.compiler.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
var amode addressMode
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
store := m.allocateInstr()
store.asStore(operandNR(v), amode, typ.Bits())
cur = linkInstr(cur, store)
return linkInstr(cur, prevNext)
}
// InsertReloadRegisterAt implements backend.RegAllocFunctionMachine.
func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, after bool) *instruction {
if !v.IsRealReg() {
panic("BUG: VReg must be backed by real reg to be stored")
}
typ := m.compiler.TypeOf(v)
var prevNext, cur *instruction
if after {
cur, prevNext = instr, instr.next
} else {
cur, prevNext = instr.prev, instr
}
offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
var amode addressMode
cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
load := m.allocateInstr()
switch typ {
case ssa.TypeI32, ssa.TypeI64:
load.asULoad(operandNR(v), amode, typ.Bits())
case ssa.TypeF32, ssa.TypeF64:
load.asFpuLoad(operandNR(v), amode, typ.Bits())
case ssa.TypeV128:
load.asFpuLoad(operandNR(v), amode, 128)
default:
panic("TODO")
}
cur = linkInstr(cur, load)
return linkInstr(cur, prevNext)
}
// LastInstrForInsertion implements backend.RegAllocFunctionMachine.
func (m *machine) LastInstrForInsertion(begin, end *instruction) *instruction {
cur := end
for cur.kind == nop0 {
cur = cur.prev
if cur == begin {
return end
}
}
switch cur.kind {
case br:
return cur
default:
return end
}
}

View File

@ -0,0 +1,117 @@
package arm64
import (
"encoding/binary"
"fmt"
"math"
"sort"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
)
const (
// trampolineCallSize is the size of the trampoline instruction sequence for each function in an island.
trampolineCallSize = 4*4 + 4 // Four instructions + 32-bit immediate.
// Unconditional branch offset is encoded as divided by 4 in imm26.
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en
maxUnconditionalBranchOffset = maxSignedInt26 * 4
minUnconditionalBranchOffset = minSignedInt26 * 4
// trampolineIslandInterval is the range of the trampoline island.
// Half of the range is used for the trampoline island, and the other half is used for the function.
trampolineIslandInterval = maxUnconditionalBranchOffset / 2
// maxNumFunctions explicitly specifies the maximum number of functions that can be allowed in a single executable.
maxNumFunctions = trampolineIslandInterval >> 6
// maxFunctionExecutableSize is the maximum size of a function that can exist in a trampoline island.
// Conservatively set to 1/4 of the trampoline island interval.
maxFunctionExecutableSize = trampolineIslandInterval >> 2
)
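// trampolineEntryOffset is an illustrative sketch, not part of the original source: it shows how a
// function's trampoline entry is located inside an island, mirroring the islandOffset + trampolineCallSize*i
// layout used by ResolveRelocations and encodeCallTrampolineIsland below. The function name is hypothetical.
func trampolineEntryOffset(islandOffset, funcRef int) int {
	// Each entry occupies trampolineCallSize bytes (4 instructions + a 32-bit immediate = 20 bytes).
	return islandOffset + trampolineCallSize*funcRef
}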
// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo.
func (m *machine) CallTrampolineIslandInfo(numFunctions int) (interval, size int, err error) {
if numFunctions > maxNumFunctions {
return 0, 0, fmt.Errorf("too many functions: %d > %d", numFunctions, maxNumFunctions)
}
return trampolineIslandInterval, trampolineCallSize * numFunctions, nil
}
// ResolveRelocations implements backend.Machine ResolveRelocations.
func (m *machine) ResolveRelocations(
refToBinaryOffset []int,
executable []byte,
relocations []backend.RelocationInfo,
callTrampolineIslandOffsets []int,
) {
for _, islandOffset := range callTrampolineIslandOffsets {
encodeCallTrampolineIsland(refToBinaryOffset, islandOffset, executable)
}
for _, r := range relocations {
instrOffset := r.Offset
calleeFnOffset := refToBinaryOffset[r.FuncRef]
diff := int64(calleeFnOffset) - (instrOffset)
// Check if the diff is within the range of the branch instruction.
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
// Find the near trampoline island from callTrampolineIslandOffsets.
islandOffset := searchTrampolineIsland(callTrampolineIslandOffsets, int(instrOffset))
islandTargetOffset := islandOffset + trampolineCallSize*int(r.FuncRef)
diff = int64(islandTargetOffset) - (instrOffset)
if diff < minUnconditionalBranchOffset || diff > maxUnconditionalBranchOffset {
panic("BUG in trampoline placement")
}
}
binary.LittleEndian.PutUint32(executable[instrOffset:instrOffset+4], encodeUnconditionalBranch(true, diff))
}
}
// encodeCallTrampolineIsland encodes a trampoline island for the given functions.
// Each island consists of a trampoline instruction sequence for each function.
// Each trampoline instruction sequence consists of 4 instructions + 32-bit immediate.
func encodeCallTrampolineIsland(refToBinaryOffset []int, islandOffset int, executable []byte) {
for i := 0; i < len(refToBinaryOffset); i++ {
trampolineOffset := islandOffset + trampolineCallSize*i
fnOffset := refToBinaryOffset[i]
diff := fnOffset - (trampolineOffset + 16)
if diff > math.MaxInt32 || diff < math.MinInt32 {
// Even amd64 can't handle this case: 4GB is too big.
panic("too big binary")
}
// tmpReg and tmpReg2 are safe to overwrite (in fact, any caller-saved register is safe to use).
tmpReg, tmpReg2 := regNumberInEncoding[tmpRegVReg.RealReg()], regNumberInEncoding[x11]
// adr tmpReg, PC+16: load the address of #diff into tmpReg.
binary.LittleEndian.PutUint32(executable[trampolineOffset:], encodeAdr(tmpReg, 16))
// ldrsw tmpReg2, [tmpReg]: Load #diff into tmpReg2.
binary.LittleEndian.PutUint32(executable[trampolineOffset+4:],
encodeLoadOrStore(sLoad32, tmpReg2, addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpRegVReg}))
// add tmpReg, tmpReg2, tmpReg: add #diff to the address of #diff, getting the absolute address of the function.
binary.LittleEndian.PutUint32(executable[trampolineOffset+8:],
encodeAluRRR(aluOpAdd, tmpReg, tmpReg, tmpReg2, true, false))
// br tmpReg: branch to the function without overwriting the link register.
binary.LittleEndian.PutUint32(executable[trampolineOffset+12:], encodeUnconditionalBranchReg(tmpReg, false))
// #diff
binary.LittleEndian.PutUint32(executable[trampolineOffset+16:], uint32(diff))
}
}
// searchTrampolineIsland finds the nearest trampoline island from callTrampolineIslandOffsets.
// Note that even if the offset is in the middle of two islands, it returns the latter one.
// That is ok because the island is always placed in the middle of the range.
//
// precondition: callTrampolineIslandOffsets is sorted in ascending order.
func searchTrampolineIsland(callTrampolineIslandOffsets []int, offset int) int {
l := len(callTrampolineIslandOffsets)
n := sort.Search(l, func(i int) bool {
return callTrampolineIslandOffsets[i] >= offset
})
if n == l {
n = l - 1
}
return callTrampolineIslandOffsets[n]
}
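// The following is an illustrative sketch, not part of the original source: with islands at offsets
// 0x1000 and 0x3000, an instruction at 0x1800 resolves to the island at 0x3000, because sort.Search
// picks the first island at or after the offset (and the last island is the fallback when none follows).
// The function name is hypothetical.
func searchTrampolineIslandExample() int {
	return searchTrampolineIsland([]int{0x1000, 0x3000}, 0x1800) // == 0x3000
}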

View File

@ -0,0 +1,397 @@
package arm64
import (
"fmt"
"strconv"
"strings"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
)
// Arm64-specific registers.
//
// See https://developer.arm.com/documentation/dui0801/a/Overview-of-AArch64-state/Predeclared-core-register-names-in-AArch64-state
const (
// General purpose registers. Note that we do not distinguish wn and xn registers
// because they are the same from the perspective of the register allocator, and
// the size can be determined by the type of the instruction.
x0 = regalloc.RealRegInvalid + 1 + iota
x1
x2
x3
x4
x5
x6
x7
x8
x9
x10
x11
x12
x13
x14
x15
x16
x17
x18
x19
x20
x21
x22
x23
x24
x25
x26
x27
x28
x29
x30
// Vector registers. Note that we do not distinguish vn and dn, ... registers
// because they are the same from the perspective of register allocator, and
// the size can be determined by the type of the instruction.
v0
v1
v2
v3
v4
v5
v6
v7
v8
v9
v10
v11
v12
v13
v14
v15
v16
v17
v18
v19
v20
v21
v22
v23
v24
v25
v26
v27
v28
v29
v30
v31
// Special registers
xzr
sp
lr = x30
fp = x29
tmp = x27
)
var (
x0VReg = regalloc.FromRealReg(x0, regalloc.RegTypeInt)
x1VReg = regalloc.FromRealReg(x1, regalloc.RegTypeInt)
x2VReg = regalloc.FromRealReg(x2, regalloc.RegTypeInt)
x3VReg = regalloc.FromRealReg(x3, regalloc.RegTypeInt)
x4VReg = regalloc.FromRealReg(x4, regalloc.RegTypeInt)
x5VReg = regalloc.FromRealReg(x5, regalloc.RegTypeInt)
x6VReg = regalloc.FromRealReg(x6, regalloc.RegTypeInt)
x7VReg = regalloc.FromRealReg(x7, regalloc.RegTypeInt)
x8VReg = regalloc.FromRealReg(x8, regalloc.RegTypeInt)
x9VReg = regalloc.FromRealReg(x9, regalloc.RegTypeInt)
x10VReg = regalloc.FromRealReg(x10, regalloc.RegTypeInt)
x11VReg = regalloc.FromRealReg(x11, regalloc.RegTypeInt)
x12VReg = regalloc.FromRealReg(x12, regalloc.RegTypeInt)
x13VReg = regalloc.FromRealReg(x13, regalloc.RegTypeInt)
x14VReg = regalloc.FromRealReg(x14, regalloc.RegTypeInt)
x15VReg = regalloc.FromRealReg(x15, regalloc.RegTypeInt)
x16VReg = regalloc.FromRealReg(x16, regalloc.RegTypeInt)
x17VReg = regalloc.FromRealReg(x17, regalloc.RegTypeInt)
x18VReg = regalloc.FromRealReg(x18, regalloc.RegTypeInt)
x19VReg = regalloc.FromRealReg(x19, regalloc.RegTypeInt)
x20VReg = regalloc.FromRealReg(x20, regalloc.RegTypeInt)
x21VReg = regalloc.FromRealReg(x21, regalloc.RegTypeInt)
x22VReg = regalloc.FromRealReg(x22, regalloc.RegTypeInt)
x23VReg = regalloc.FromRealReg(x23, regalloc.RegTypeInt)
x24VReg = regalloc.FromRealReg(x24, regalloc.RegTypeInt)
x25VReg = regalloc.FromRealReg(x25, regalloc.RegTypeInt)
x26VReg = regalloc.FromRealReg(x26, regalloc.RegTypeInt)
x27VReg = regalloc.FromRealReg(x27, regalloc.RegTypeInt)
x28VReg = regalloc.FromRealReg(x28, regalloc.RegTypeInt)
x29VReg = regalloc.FromRealReg(x29, regalloc.RegTypeInt)
x30VReg = regalloc.FromRealReg(x30, regalloc.RegTypeInt)
v0VReg = regalloc.FromRealReg(v0, regalloc.RegTypeFloat)
v1VReg = regalloc.FromRealReg(v1, regalloc.RegTypeFloat)
v2VReg = regalloc.FromRealReg(v2, regalloc.RegTypeFloat)
v3VReg = regalloc.FromRealReg(v3, regalloc.RegTypeFloat)
v4VReg = regalloc.FromRealReg(v4, regalloc.RegTypeFloat)
v5VReg = regalloc.FromRealReg(v5, regalloc.RegTypeFloat)
v6VReg = regalloc.FromRealReg(v6, regalloc.RegTypeFloat)
v7VReg = regalloc.FromRealReg(v7, regalloc.RegTypeFloat)
v8VReg = regalloc.FromRealReg(v8, regalloc.RegTypeFloat)
v9VReg = regalloc.FromRealReg(v9, regalloc.RegTypeFloat)
v10VReg = regalloc.FromRealReg(v10, regalloc.RegTypeFloat)
v11VReg = regalloc.FromRealReg(v11, regalloc.RegTypeFloat)
v12VReg = regalloc.FromRealReg(v12, regalloc.RegTypeFloat)
v13VReg = regalloc.FromRealReg(v13, regalloc.RegTypeFloat)
v14VReg = regalloc.FromRealReg(v14, regalloc.RegTypeFloat)
v15VReg = regalloc.FromRealReg(v15, regalloc.RegTypeFloat)
v16VReg = regalloc.FromRealReg(v16, regalloc.RegTypeFloat)
v17VReg = regalloc.FromRealReg(v17, regalloc.RegTypeFloat)
v18VReg = regalloc.FromRealReg(v18, regalloc.RegTypeFloat)
v19VReg = regalloc.FromRealReg(v19, regalloc.RegTypeFloat)
v20VReg = regalloc.FromRealReg(v20, regalloc.RegTypeFloat)
v21VReg = regalloc.FromRealReg(v21, regalloc.RegTypeFloat)
v22VReg = regalloc.FromRealReg(v22, regalloc.RegTypeFloat)
v23VReg = regalloc.FromRealReg(v23, regalloc.RegTypeFloat)
v24VReg = regalloc.FromRealReg(v24, regalloc.RegTypeFloat)
v25VReg = regalloc.FromRealReg(v25, regalloc.RegTypeFloat)
v26VReg = regalloc.FromRealReg(v26, regalloc.RegTypeFloat)
v27VReg = regalloc.FromRealReg(v27, regalloc.RegTypeFloat)
// lr (link register) holds the return address at the function entry.
lrVReg = x30VReg
// tmpReg is used to perform spill/load on large stack offsets, and load large constants.
// Therefore, be cautious to use this register in the middle of the compilation, especially before the register allocation.
// This is the same as golang/go, but it's only described in the source code:
// https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L59
// https://github.com/golang/go/blob/18e17e2cb12837ea2c8582ecdb0cc780f49a1aac/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go#L13-L15
tmpRegVReg = regalloc.FromRealReg(tmp, regalloc.RegTypeInt)
v28VReg = regalloc.FromRealReg(v28, regalloc.RegTypeFloat)
v29VReg = regalloc.FromRealReg(v29, regalloc.RegTypeFloat)
v30VReg = regalloc.FromRealReg(v30, regalloc.RegTypeFloat)
v31VReg = regalloc.FromRealReg(v31, regalloc.RegTypeFloat)
xzrVReg = regalloc.FromRealReg(xzr, regalloc.RegTypeInt)
spVReg = regalloc.FromRealReg(sp, regalloc.RegTypeInt)
fpVReg = regalloc.FromRealReg(fp, regalloc.RegTypeInt)
)
var regNames = [...]string{
x0: "x0",
x1: "x1",
x2: "x2",
x3: "x3",
x4: "x4",
x5: "x5",
x6: "x6",
x7: "x7",
x8: "x8",
x9: "x9",
x10: "x10",
x11: "x11",
x12: "x12",
x13: "x13",
x14: "x14",
x15: "x15",
x16: "x16",
x17: "x17",
x18: "x18",
x19: "x19",
x20: "x20",
x21: "x21",
x22: "x22",
x23: "x23",
x24: "x24",
x25: "x25",
x26: "x26",
x27: "x27",
x28: "x28",
x29: "x29",
x30: "x30",
xzr: "xzr",
sp: "sp",
v0: "v0",
v1: "v1",
v2: "v2",
v3: "v3",
v4: "v4",
v5: "v5",
v6: "v6",
v7: "v7",
v8: "v8",
v9: "v9",
v10: "v10",
v11: "v11",
v12: "v12",
v13: "v13",
v14: "v14",
v15: "v15",
v16: "v16",
v17: "v17",
v18: "v18",
v19: "v19",
v20: "v20",
v21: "v21",
v22: "v22",
v23: "v23",
v24: "v24",
v25: "v25",
v26: "v26",
v27: "v27",
v28: "v28",
v29: "v29",
v30: "v30",
v31: "v31",
}
func formatVRegSized(r regalloc.VReg, size byte) (ret string) {
if r.IsRealReg() {
ret = regNames[r.RealReg()]
switch ret[0] {
case 'x':
switch size {
case 32:
ret = strings.Replace(ret, "x", "w", 1)
case 64:
default:
panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
}
case 'v':
switch size {
case 32:
ret = strings.Replace(ret, "v", "s", 1)
case 64:
ret = strings.Replace(ret, "v", "d", 1)
case 128:
ret = strings.Replace(ret, "v", "q", 1)
default:
panic("BUG: invalid register size")
}
}
} else {
switch r.RegType() {
case regalloc.RegTypeInt:
switch size {
case 32:
ret = fmt.Sprintf("w%d?", r.ID())
case 64:
ret = fmt.Sprintf("x%d?", r.ID())
default:
panic("BUG: invalid register size: " + strconv.Itoa(int(size)))
}
case regalloc.RegTypeFloat:
switch size {
case 32:
ret = fmt.Sprintf("s%d?", r.ID())
case 64:
ret = fmt.Sprintf("d%d?", r.ID())
case 128:
ret = fmt.Sprintf("q%d?", r.ID())
default:
panic("BUG: invalid register size")
}
default:
panic(fmt.Sprintf("BUG: invalid register type: %d for %s", r.RegType(), r))
}
}
return
}
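// formatVRegSizedExamples is an illustrative sketch, not part of the original source: it demonstrates
// the naming scheme implemented above, i.e. w/x prefixes for 32/64-bit views of the integer registers
// and s/d/q prefixes for 32/64/128-bit views of the vector registers. The function name is hypothetical.
func formatVRegSizedExamples() []string {
	return []string{
		formatVRegSized(x0VReg, 32),  // "w0"
		formatVRegSized(x0VReg, 64),  // "x0"
		formatVRegSized(v1VReg, 32),  // "s1"
		formatVRegSized(v1VReg, 128), // "q1"
	}
}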
func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) {
var id string
wspec := strings.ToLower(width.String())
if r.IsRealReg() {
id = regNames[r.RealReg()][1:]
} else {
id = fmt.Sprintf("%d?", r.ID())
}
ret = fmt.Sprintf("%s%s", wspec, id)
return
}
func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) {
id := fmt.Sprintf("v%d?", r.ID())
if r.IsRealReg() {
id = regNames[r.RealReg()]
}
ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String()))
if index != vecIndexNone {
ret += fmt.Sprintf("[%d]", index)
}
return
}
func regTypeToRegisterSizeInBits(r regalloc.RegType) byte {
switch r {
case regalloc.RegTypeInt:
return 64
case regalloc.RegTypeFloat:
return 128
default:
panic("BUG: invalid register type")
}
}
var regNumberInEncoding = [...]uint32{
x0: 0,
x1: 1,
x2: 2,
x3: 3,
x4: 4,
x5: 5,
x6: 6,
x7: 7,
x8: 8,
x9: 9,
x10: 10,
x11: 11,
x12: 12,
x13: 13,
x14: 14,
x15: 15,
x16: 16,
x17: 17,
x18: 18,
x19: 19,
x20: 20,
x21: 21,
x22: 22,
x23: 23,
x24: 24,
x25: 25,
x26: 26,
x27: 27,
x28: 28,
x29: 29,
x30: 30,
xzr: 31,
sp: 31,
v0: 0,
v1: 1,
v2: 2,
v3: 3,
v4: 4,
v5: 5,
v6: 6,
v7: 7,
v8: 8,
v9: 9,
v10: 10,
v11: 11,
v12: 12,
v13: 13,
v14: 14,
v15: 15,
v16: 16,
v17: 17,
v18: 18,
v19: 19,
v20: 20,
v21: 21,
v22: 22,
v23: 23,
v24: 24,
v25: 25,
v26: 26,
v27: 27,
v28: 28,
v29: 29,
v30: 30,
v31: 31,
}

View File

@ -0,0 +1,90 @@
package arm64
import (
"encoding/binary"
"reflect"
"unsafe"
"github.com/tetratelabs/wazero/internal/wasmdebug"
)
// UnwindStack implements wazevo.unwindStack.
func UnwindStack(sp, _, top uintptr, returnAddresses []uintptr) []uintptr {
l := int(top - sp)
var stackBuf []byte
{
// TODO: use unsafe.Slice after floor version is set to Go 1.20.
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&stackBuf))
hdr.Data = sp
hdr.Len = l
hdr.Cap = l
}
for i := uint64(0); i < uint64(l); {
// (high address)
// +-----------------+
// | ....... |
// | ret Y | <----+
// | ....... | |
// | ret 0 | |
// | arg X | | size_of_arg_ret
// | ....... | |
// | arg 1 | |
// | arg 0 | <----+
// | size_of_arg_ret |
// | ReturnAddress |
// +-----------------+ <----+
// | ........... | |
// | spill slot M | |
// | ............ | |
// | spill slot 2 | |
// | spill slot 1 | | frame size
// | spill slot 0 | |
// | clobbered N | |
// | ............ | |
// | clobbered 0 | <----+
// | xxxxxx | ;; unused space to make it 16-byte aligned.
// | frame_size |
// +-----------------+ <---- SP
// (low address)
frameSize := binary.LittleEndian.Uint64(stackBuf[i:])
i += frameSize +
16 // frame size + aligned space.
retAddr := binary.LittleEndian.Uint64(stackBuf[i:])
i += 8 // ret addr.
sizeOfArgRet := binary.LittleEndian.Uint64(stackBuf[i:])
i += 8 + sizeOfArgRet
returnAddresses = append(returnAddresses, uintptr(retAddr))
if len(returnAddresses) == wasmdebug.MaxFrames {
break
}
}
return returnAddresses
}
// GoCallStackView implements wazevo.goCallStackView.
func GoCallStackView(stackPointerBeforeGoCall *uint64) []uint64 {
// (high address)
// +-----------------+ <----+
// | xxxxxxxxxxx | | ;; optional unused space to make it 16-byte aligned.
// ^ | arg[N]/ret[M] | |
// sliceSize | | ............ | | sliceSize
// | | arg[1]/ret[1] | |
// v | arg[0]/ret[0] | <----+
// | sliceSize |
// | frame_size |
// +-----------------+ <---- stackPointerBeforeGoCall
// (low address)
ptr := unsafe.Pointer(stackPointerBeforeGoCall)
size := *(*uint64)(unsafe.Add(ptr, 8))
var view []uint64
{
sh := (*reflect.SliceHeader)(unsafe.Pointer(&view))
sh.Data = uintptr(unsafe.Add(ptr, 16)) // skips the (frame_size, sliceSize).
sh.Len = int(size)
sh.Cap = int(size)
}
return view
}

View File

@ -0,0 +1,100 @@
package backend
import (
"context"
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
)
type (
// Machine is a backend for a specific ISA machine.
Machine interface {
ExecutableContext() ExecutableContext
// DisableStackCheck disables the stack check for the current compilation for debugging/testing.
DisableStackCheck()
// SetCurrentABI initializes the FunctionABI for the given signature.
SetCurrentABI(abi *FunctionABI)
// SetCompiler sets the compilation context used for the lifetime of Machine.
// This is only called once per Machine, i.e. before the first compilation.
SetCompiler(Compiler)
// LowerSingleBranch is called when the compilation of the given single branch is started.
LowerSingleBranch(b *ssa.Instruction)
// LowerConditionalBranch is called when the compilation of the given conditional branch is started.
LowerConditionalBranch(b *ssa.Instruction)
// LowerInstr is called for each instruction in the given block except for the ones marked as already lowered
// via Compiler.MarkLowered. The order is reverse, i.e. from the last instruction to the first one.
//
// Note: this can lower multiple instructions (which produce the inputs) at once whenever it's possible
// for optimization.
LowerInstr(*ssa.Instruction)
// Reset resets the machine state for the next compilation.
Reset()
// InsertMove inserts a move instruction from src to dst whose type is typ.
InsertMove(dst, src regalloc.VReg, typ ssa.Type)
// InsertReturn inserts the return instruction to return from the current function.
InsertReturn()
// InsertLoadConstantBlockArg inserts the instruction(s) to load the constant value into the given regalloc.VReg.
InsertLoadConstantBlockArg(instr *ssa.Instruction, vr regalloc.VReg)
// Format returns the string representation of the currently compiled machine code.
// This is only for testing purpose.
Format() string
// RegAlloc does the register allocation after lowering.
RegAlloc()
// PostRegAlloc does the post register allocation, e.g. setting up prologue/epilogue, redundant move elimination, etc.
PostRegAlloc()
// ResolveRelocations resolves the relocations after emitting machine code.
// * refToBinaryOffset: the map from the function reference (ssa.FuncRef) to the executable offset.
// * executable: the binary to resolve the relocations.
// * relocations: the relocations to resolve.
// * callTrampolineIslandOffsets: the offsets of the trampoline islands in the executable.
ResolveRelocations(
refToBinaryOffset []int,
executable []byte,
relocations []RelocationInfo,
callTrampolineIslandOffsets []int,
)
// Encode encodes the machine instructions to the Compiler.
Encode(ctx context.Context) error
// CompileGoFunctionTrampoline compiles the trampoline function to call a Go function of the given exit code and signature.
CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig *ssa.Signature, needModuleContextPtr bool) []byte
// CompileStackGrowCallSequence returns the sequence of instructions shared by all functions to
// call the stack grow builtin function.
CompileStackGrowCallSequence() []byte
// CompileEntryPreamble returns the sequence of instructions shared by multiple functions to
// enter the function from Go.
CompileEntryPreamble(signature *ssa.Signature) []byte
// LowerParams lowers the given parameters.
LowerParams(params []ssa.Value)
// LowerReturns lowers the given returns.
LowerReturns(returns []ssa.Value)
// ArgsResultsRegs returns the registers used for arguments and return values.
ArgsResultsRegs() (argResultInts, argResultFloats []regalloc.RealReg)
// CallTrampolineIslandInfo returns the interval of the offset where the trampoline island is placed, and
// the size of the trampoline island. If islandSize is zero, the trampoline island is not used on this machine.
CallTrampolineIslandInfo(numFunctions int) (interval, islandSize int, err error)
}
)

View File

@ -0,0 +1,319 @@
package backend
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// RegAllocFunctionMachine is the interface for the machine specific logic that will be used in RegAllocFunction.
type RegAllocFunctionMachine[I regalloc.InstrConstraint] interface {
// InsertMoveBefore inserts the move instruction from src to dst before the given instruction.
InsertMoveBefore(dst, src regalloc.VReg, instr I)
// InsertStoreRegisterAt inserts the instruction(s) to store the given virtual register at the given instruction.
// If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
InsertStoreRegisterAt(v regalloc.VReg, instr I, after bool) I
// InsertReloadRegisterAt inserts the instruction(s) to reload the given virtual register at the given instruction.
// If after is true, the instruction(s) will be inserted after the given instruction, otherwise before.
InsertReloadRegisterAt(v regalloc.VReg, instr I, after bool) I
// ClobberedRegisters is called when the register allocation is done and the clobbered registers are known.
ClobberedRegisters(regs []regalloc.VReg)
// Swap swaps the two virtual registers after the given instruction.
Swap(cur I, x1, x2, tmp regalloc.VReg)
// LastInstrForInsertion implements LastInstrForInsertion of regalloc.Function. See its comment for details.
LastInstrForInsertion(begin, end I) I
// SSABlockLabel returns the label of the given ssa.BasicBlockID.
SSABlockLabel(id ssa.BasicBlockID) Label
}
type (
// RegAllocFunction implements regalloc.Function.
RegAllocFunction[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
m m
ssb ssa.Builder
c Compiler
// iter is the iterator for reversePostOrderBlocks
iter int
reversePostOrderBlocks []RegAllocBlock[I, m]
// labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks.
labelToRegAllocBlockIndex map[Label]int
loopNestingForestRoots []ssa.BasicBlock
}
// RegAllocBlock implements regalloc.Block.
RegAllocBlock[I regalloc.InstrConstraint, m RegAllocFunctionMachine[I]] struct {
// f is the function this block belongs to. Used to reuse the regAllocFunctionImpl.predsSlice slice for Defs() and Uses().
f *RegAllocFunction[I, m]
sb ssa.BasicBlock
l Label
begin, end I
loopNestingForestChildren []ssa.BasicBlock
cur I
id int
cachedLastInstrForInsertion I
}
)
// NewRegAllocFunction returns a new RegAllocFunction.
func NewRegAllocFunction[I regalloc.InstrConstraint, M RegAllocFunctionMachine[I]](m M, ssb ssa.Builder, c Compiler) *RegAllocFunction[I, M] {
return &RegAllocFunction[I, M]{
m: m,
ssb: ssb,
c: c,
labelToRegAllocBlockIndex: make(map[Label]int),
}
}
// AddBlock adds a new block to the function.
func (f *RegAllocFunction[I, M]) AddBlock(sb ssa.BasicBlock, l Label, begin, end I) {
i := len(f.reversePostOrderBlocks)
f.reversePostOrderBlocks = append(f.reversePostOrderBlocks, RegAllocBlock[I, M]{
f: f,
sb: sb,
l: l,
begin: begin,
end: end,
id: int(sb.ID()),
})
f.labelToRegAllocBlockIndex[l] = i
}
// Reset resets the function for the next compilation.
func (f *RegAllocFunction[I, M]) Reset() {
f.reversePostOrderBlocks = f.reversePostOrderBlocks[:0]
f.iter = 0
}
// StoreRegisterAfter implements regalloc.Function StoreRegisterAfter.
func (f *RegAllocFunction[I, M]) StoreRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertStoreRegisterAt(v, instr.(I), true)
}
// ReloadRegisterBefore implements regalloc.Function ReloadRegisterBefore.
func (f *RegAllocFunction[I, M]) ReloadRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertReloadRegisterAt(v, instr.(I), false)
}
// ReloadRegisterAfter implements regalloc.Function ReloadRegisterAfter.
func (f *RegAllocFunction[I, M]) ReloadRegisterAfter(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertReloadRegisterAt(v, instr.(I), true)
}
// StoreRegisterBefore implements regalloc.Function StoreRegisterBefore.
func (f *RegAllocFunction[I, M]) StoreRegisterBefore(v regalloc.VReg, instr regalloc.Instr) {
m := f.m
m.InsertStoreRegisterAt(v, instr.(I), false)
}
// ClobberedRegisters implements regalloc.Function ClobberedRegisters.
func (f *RegAllocFunction[I, M]) ClobberedRegisters(regs []regalloc.VReg) {
f.m.ClobberedRegisters(regs)
}
// SwapBefore implements regalloc.Function SwapBefore.
func (f *RegAllocFunction[I, M]) SwapBefore(x1, x2, tmp regalloc.VReg, instr regalloc.Instr) {
f.m.Swap(instr.Prev().(I), x1, x2, tmp)
}
// PostOrderBlockIteratorBegin implements regalloc.Function PostOrderBlockIteratorBegin.
func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorBegin() regalloc.Block {
f.iter = len(f.reversePostOrderBlocks) - 1
return f.PostOrderBlockIteratorNext()
}
// PostOrderBlockIteratorNext implements regalloc.Function PostOrderBlockIteratorNext.
func (f *RegAllocFunction[I, M]) PostOrderBlockIteratorNext() regalloc.Block {
if f.iter < 0 {
return nil
}
b := &f.reversePostOrderBlocks[f.iter]
f.iter--
return b
}
// ReversePostOrderBlockIteratorBegin implements regalloc.Function ReversePostOrderBlockIteratorBegin.
func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorBegin() regalloc.Block {
f.iter = 0
return f.ReversePostOrderBlockIteratorNext()
}
// ReversePostOrderBlockIteratorNext implements regalloc.Function ReversePostOrderBlockIteratorNext.
func (f *RegAllocFunction[I, M]) ReversePostOrderBlockIteratorNext() regalloc.Block {
if f.iter >= len(f.reversePostOrderBlocks) {
return nil
}
b := &f.reversePostOrderBlocks[f.iter]
f.iter++
return b
}
// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots.
func (f *RegAllocFunction[I, M]) LoopNestingForestRoots() int {
f.loopNestingForestRoots = f.ssb.LoopNestingForestRoots()
return len(f.loopNestingForestRoots)
}
// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot.
func (f *RegAllocFunction[I, M]) LoopNestingForestRoot(i int) regalloc.Block {
blk := f.loopNestingForestRoots[i]
l := f.m.SSABlockLabel(blk.ID())
index := f.labelToRegAllocBlockIndex[l]
return &f.reversePostOrderBlocks[index]
}
// InsertMoveBefore implements regalloc.Function InsertMoveBefore.
func (f *RegAllocFunction[I, M]) InsertMoveBefore(dst, src regalloc.VReg, instr regalloc.Instr) {
f.m.InsertMoveBefore(dst, src, instr.(I))
}
// LowestCommonAncestor implements regalloc.Function LowestCommonAncestor.
func (f *RegAllocFunction[I, M]) LowestCommonAncestor(blk1, blk2 regalloc.Block) regalloc.Block {
ret := f.ssb.LowestCommonAncestor(blk1.(*RegAllocBlock[I, M]).sb, blk2.(*RegAllocBlock[I, M]).sb)
l := f.m.SSABlockLabel(ret.ID())
index := f.labelToRegAllocBlockIndex[l]
return &f.reversePostOrderBlocks[index]
}
// Idom implements regalloc.Function Idom.
func (f *RegAllocFunction[I, M]) Idom(blk regalloc.Block) regalloc.Block {
builder := f.ssb
idom := builder.Idom(blk.(*RegAllocBlock[I, M]).sb)
if idom == nil {
panic("BUG: idom must not be nil")
}
l := f.m.SSABlockLabel(idom.ID())
index := f.labelToRegAllocBlockIndex[l]
return &f.reversePostOrderBlocks[index]
}
// ID implements regalloc.Block.
func (r *RegAllocBlock[I, m]) ID() int32 { return int32(r.id) }
// BlockParams implements regalloc.Block.
func (r *RegAllocBlock[I, m]) BlockParams(regs *[]regalloc.VReg) []regalloc.VReg {
c := r.f.c
*regs = (*regs)[:0]
for i := 0; i < r.sb.Params(); i++ {
v := c.VRegOf(r.sb.Param(i))
*regs = append(*regs, v)
}
return *regs
}
// InstrIteratorBegin implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrIteratorBegin() regalloc.Instr {
r.cur = r.begin
return r.cur
}
// InstrIteratorNext implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrIteratorNext() regalloc.Instr {
for {
if r.cur == r.end {
return nil
}
instr := r.cur.Next()
r.cur = instr.(I)
if instr == nil {
return nil
} else if instr.AddedBeforeRegAlloc() {
// Only concerned about the instruction added before regalloc.
return instr
}
}
}
// InstrRevIteratorBegin implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrRevIteratorBegin() regalloc.Instr {
r.cur = r.end
return r.cur
}
// InstrRevIteratorNext implements regalloc.Block.
func (r *RegAllocBlock[I, m]) InstrRevIteratorNext() regalloc.Instr {
for {
if r.cur == r.begin {
return nil
}
instr := r.cur.Prev()
r.cur = instr.(I)
if instr == nil {
return nil
} else if instr.AddedBeforeRegAlloc() {
// Only concerned about the instruction added before regalloc.
return instr
}
}
}
// FirstInstr implements regalloc.Block.
func (r *RegAllocBlock[I, m]) FirstInstr() regalloc.Instr {
return r.begin
}
// EndInstr implements regalloc.Block.
func (r *RegAllocBlock[I, m]) EndInstr() regalloc.Instr {
return r.end
}
// LastInstrForInsertion implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LastInstrForInsertion() regalloc.Instr {
var nil I
if r.cachedLastInstrForInsertion == nil {
r.cachedLastInstrForInsertion = r.f.m.LastInstrForInsertion(r.begin, r.end)
}
return r.cachedLastInstrForInsertion
}
// Preds implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Preds() int { return r.sb.Preds() }
// Pred implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Pred(i int) regalloc.Block {
sb := r.sb
pred := sb.Pred(i)
l := r.f.m.SSABlockLabel(pred.ID())
index := r.f.labelToRegAllocBlockIndex[l]
return &r.f.reversePostOrderBlocks[index]
}
// Entry implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Entry() bool { return r.sb.EntryBlock() }
// Succs implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Succs() int {
return r.sb.Succs()
}
// Succ implements regalloc.Block.
func (r *RegAllocBlock[I, m]) Succ(i int) regalloc.Block {
sb := r.sb
succ := sb.Succ(i)
if succ.ReturnBlock() {
return nil
}
l := r.f.m.SSABlockLabel(succ.ID())
index := r.f.labelToRegAllocBlockIndex[l]
return &r.f.reversePostOrderBlocks[index]
}
// LoopHeader implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopHeader() bool {
return r.sb.LoopHeader()
}
// LoopNestingForestChildren implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopNestingForestChildren() int {
r.loopNestingForestChildren = r.sb.LoopNestingForestChildren()
return len(r.loopNestingForestChildren)
}
// LoopNestingForestChild implements regalloc.Block.
func (r *RegAllocBlock[I, m]) LoopNestingForestChild(i int) regalloc.Block {
blk := r.loopNestingForestChildren[i]
l := r.f.m.SSABlockLabel(blk.ID())
index := r.f.labelToRegAllocBlockIndex[l]
return &r.f.reversePostOrderBlocks[index]
}

View File

@ -0,0 +1,136 @@
package regalloc
import "fmt"
// These interfaces are implemented by ISA-specific backends to abstract away the details, and allow the register
// allocators to work on any ISA.
//
// TODO: the interfaces are not stabilized yet, especially x64 will need some changes. E.g. x64 has an addressing mode
// where index can be in memory. That kind of info will be useful to reduce the register pressure, and should be leveraged
// by the register allocators, like https://docs.rs/regalloc2/latest/regalloc2/enum.OperandConstraint.html
type (
// Function is the top-level interface to do register allocation, which corresponds to a CFG containing
// Blocks(s).
Function interface {
// PostOrderBlockIteratorBegin returns the first block in the post-order traversal of the CFG.
// In other words, the last blocks in the CFG will be returned first.
PostOrderBlockIteratorBegin() Block
// PostOrderBlockIteratorNext returns the next block in the post-order traversal of the CFG.
PostOrderBlockIteratorNext() Block
// ReversePostOrderBlockIteratorBegin returns the first block in the reverse post-order traversal of the CFG.
// In other words, the first blocks in the CFG will be returned first.
ReversePostOrderBlockIteratorBegin() Block
// ReversePostOrderBlockIteratorNext returns the next block in the reverse post-order traversal of the CFG.
ReversePostOrderBlockIteratorNext() Block
// ClobberedRegisters is called by the register allocator to report the registers clobbered by this function.
ClobberedRegisters([]VReg)
// LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function.
LoopNestingForestRoots() int
// LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function.
LoopNestingForestRoot(i int) Block
// LowestCommonAncestor returns the lowest common ancestor of two blocks in the dominator tree.
LowestCommonAncestor(blk1, blk2 Block) Block
// Idom returns the immediate dominator of the given block.
Idom(blk Block) Block
// The following methods are for rewriting the function.
// SwapBefore swaps the values of the two virtual registers right before the given instruction, using tmp as a scratch register if needed.
SwapBefore(x1, x2, tmp VReg, instr Instr)
// StoreRegisterBefore inserts store instruction(s) before the given instruction for the given virtual register.
StoreRegisterBefore(v VReg, instr Instr)
// StoreRegisterAfter inserts store instruction(s) after the given instruction for the given virtual register.
StoreRegisterAfter(v VReg, instr Instr)
// ReloadRegisterBefore inserts reload instruction(s) before the given instruction for the given virtual register.
ReloadRegisterBefore(v VReg, instr Instr)
// ReloadRegisterAfter inserts reload instruction(s) after the given instruction for the given virtual register.
ReloadRegisterAfter(v VReg, instr Instr)
// InsertMoveBefore inserts move instruction(s) before the given instruction for the given virtual registers.
InsertMoveBefore(dst, src VReg, instr Instr)
}
// Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s).
Block interface {
// ID returns the unique identifier of this block which is ordered in the reverse post-order traversal of the CFG.
ID() int32
// BlockParams returns the virtual registers used as the parameters of this block.
BlockParams(*[]VReg) []VReg
// InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped.
// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
InstrIteratorBegin() Instr
// InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped.
// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
InstrIteratorNext() Instr
// InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order.
InstrRevIteratorBegin() Instr
// InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order.
InstrRevIteratorNext() Instr
// FirstInstr returns the first instruction in this block; instructions will be inserted after it.
FirstInstr() Instr
// EndInstr returns the end instruction in this block.
EndInstr() Instr
// LastInstrForInsertion returns the last instruction in this block where instructions will be inserted before it.
// Such insertions only happen when we need to insert spill/reload instructions to adjust the merge edges.
// At the time of register allocation, all the critical edges are already split, so there is no need
// to worry about the case where branching instruction has multiple successors.
// Therefore, usually, it is the nop instruction, but if the block ends with an unconditional branching, then it returns
// the unconditional branch, not the nop. In other words it is either nop or unconditional branch.
LastInstrForInsertion() Instr
// Preds returns the number of predecessors of this block in the CFG.
Preds() int
// Pred returns the i-th predecessor of this block in the CFG.
Pred(i int) Block
// Entry returns true if the block is for the entry block.
Entry() bool
// Succs returns the number of successors of this block in the CFG.
Succs() int
// Succ returns the i-th successor of this block in the CFG.
Succ(i int) Block
// LoopHeader returns true if this block is a loop header.
LoopHeader() bool
// LoopNestingForestChildren returns the number of children of this block in the loop nesting forest.
LoopNestingForestChildren() int
// LoopNestingForestChild returns the i-th child of this block in the loop nesting forest.
LoopNestingForestChild(i int) Block
}
// Instr is an instruction in a block, abstracting away the underlying ISA.
Instr interface {
fmt.Stringer
// Next returns the next instruction in the same block.
Next() Instr
// Prev returns the previous instruction in the same block.
Prev() Instr
// Defs returns the virtual registers defined by this instruction.
Defs(*[]VReg) []VReg
// Uses returns the virtual registers used by this instruction.
// Note: multiple returned []VReg will not be held at the same time, so it's safe to use the same slice for this.
Uses(*[]VReg) []VReg
// AssignUse assigns the RealReg-allocated virtual register used by this instruction at the given index.
AssignUse(index int, v VReg)
// AssignDef assigns a RealReg-allocated virtual register defined by this instruction.
// This accepts only one register because we don't allocate registers for multi-def instructions (i.e. call instructions).
AssignDef(VReg)
// IsCopy returns true if this instruction is a move instruction between two registers.
// If true, the instruction is of the form dst = src, and if src and dst do not interfere with each other,
// we can coalesce them, and hence the copy can be eliminated from the final code.
IsCopy() bool
// IsCall returns true if this instruction is a call instruction. The result is used to insert
// caller-saved register spills and restores.
IsCall() bool
// IsIndirectCall returns true if this instruction is an indirect call instruction which calls a function pointer.
// The result is used to insert caller-saved register spills and restores.
IsIndirectCall() bool
// IsReturn returns true if this instruction is a return instruction.
IsReturn() bool
// AddedBeforeRegAlloc returns true if this instruction was added before register allocation.
AddedBeforeRegAlloc() bool
}
// InstrConstraint is an interface for arch-specific instruction constraints.
InstrConstraint interface {
comparable
Instr
}
)
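
The iterator-style API above is easiest to see in use. Below is a minimal sketch, not part of this diff: collectVRegs is a hypothetical helper that walks a single Block and gathers every virtual register that its pre-regalloc instructions use or define, assuming the iterators return a nil Instr once the block is exhausted.

func collectVRegs(blk Block) []VReg {
	var scratch, all []VReg
	for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() {
		if !instr.AddedBeforeRegAlloc() {
			continue // Instructions added after lowering must be skipped, per the doc comments above.
		}
		// Copy uses and defs out immediately: the returned slices may be reused across calls.
		all = append(all, instr.Uses(&scratch)...)
		all = append(all, instr.Defs(&scratch)...)
	}
	return all
}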

View File

@ -0,0 +1,123 @@
package regalloc
import (
"fmt"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// VReg represents a register assigned to an SSA value; it is how the backend refers to a register.
// A VReg may or may not be backed by a physical register, and the physical register info, if any, can be obtained via RealReg.
type VReg uint64
// VRegID is the lower 32 bits of a VReg, which is the pure identifier of the VReg without the RealReg info.
type VRegID uint32
// RealReg returns the RealReg of this VReg.
func (v VReg) RealReg() RealReg {
return RealReg(v >> 32)
}
// IsRealReg returns true if this VReg is backed by a physical register.
func (v VReg) IsRealReg() bool {
return v.RealReg() != RealRegInvalid
}
// FromRealReg returns a VReg from the given RealReg and RegType.
// This is used to represent a specific pre-colored register in the backend.
func FromRealReg(r RealReg, typ RegType) VReg {
rid := VRegID(r)
if rid > vRegIDReservedForRealNum {
panic(fmt.Sprintf("invalid real reg %d", r))
}
return VReg(r).SetRealReg(r).SetRegType(typ)
}
// SetRealReg sets the RealReg of this VReg and returns the updated VReg.
func (v VReg) SetRealReg(r RealReg) VReg {
return VReg(r)<<32 | (v & 0xff_00_ffffffff)
}
// RegType returns the RegType of this VReg.
func (v VReg) RegType() RegType {
return RegType(v >> 40)
}
// SetRegType sets the RegType of this VReg and returns the updated VReg.
func (v VReg) SetRegType(t RegType) VReg {
return VReg(t)<<40 | (v & 0x00_ff_ffffffff)
}
// ID returns the VRegID of this VReg.
func (v VReg) ID() VRegID {
return VRegID(v & 0xffffffff)
}
// Valid returns true if this VReg is valid.
func (v VReg) Valid() bool {
return v.ID() != vRegIDInvalid && v.RegType() != RegTypeInvalid
}
// RealReg represents a physical register.
type RealReg byte
const RealRegInvalid RealReg = 0
const (
vRegIDInvalid VRegID = 1 << 31
VRegIDNonReservedBegin = vRegIDReservedForRealNum
vRegIDReservedForRealNum VRegID = 128
VRegInvalid = VReg(vRegIDInvalid)
)
// String implements fmt.Stringer.
func (r RealReg) String() string {
switch r {
case RealRegInvalid:
return "invalid"
default:
return fmt.Sprintf("r%d", r)
}
}
// String implements fmt.Stringer.
func (v VReg) String() string {
if v.IsRealReg() {
return fmt.Sprintf("r%d", v.ID())
}
return fmt.Sprintf("v%d?", v.ID())
}
// RegType represents the type of a register.
type RegType byte
const (
RegTypeInvalid RegType = iota
RegTypeInt
RegTypeFloat
NumRegType
)
// String implements fmt.Stringer.
func (r RegType) String() string {
switch r {
case RegTypeInt:
return "int"
case RegTypeFloat:
return "float"
default:
return "invalid"
}
}
// RegTypeOf returns the RegType of the given ssa.Type.
func RegTypeOf(p ssa.Type) RegType {
switch p {
case ssa.TypeI32, ssa.TypeI64:
return RegTypeInt
case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
return RegTypeFloat
default:
panic("invalid type")
}
}
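
As a rough illustration of the bit layout encoded above (the VRegID in the low 32 bits, the RealReg in bits 32-39, and the RegType in bits 40-47), the following sketch exercises the accessors; it is not part of the diff, exampleVRegLayout is a made-up name, and the concrete register numbers are arbitrary.

func exampleVRegLayout() {
	v := VReg(10).SetRegType(RegTypeOf(ssa.TypeI64)) // VRegID 10, typed as int, not yet backed by a RealReg.
	fmt.Println(v.ID(), v.RegType(), v.IsRealReg())  // 10 int false

	pre := FromRealReg(3, RegTypeFloat) // Pre-colored register: both the ID and the RealReg are 3.
	fmt.Println(pre.ID(), pre.RealReg(), pre.RegType()) // 3 r3 float
}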

File diff suppressed because it is too large

View File

@ -0,0 +1,108 @@
package regalloc
import (
"fmt"
"strings"
)
// NewRegSet returns a new RegSet with the given registers.
func NewRegSet(regs ...RealReg) RegSet {
var ret RegSet
for _, r := range regs {
ret = ret.add(r)
}
return ret
}
// RegSet represents a set of registers.
type RegSet uint64
func (rs RegSet) format(info *RegisterInfo) string { //nolint:unused
var ret []string
for i := 0; i < 64; i++ {
if rs&(1<<uint(i)) != 0 {
ret = append(ret, info.RealRegName(RealReg(i)))
}
}
return strings.Join(ret, ", ")
}
func (rs RegSet) has(r RealReg) bool {
return rs&(1<<uint(r)) != 0
}
func (rs RegSet) add(r RealReg) RegSet {
if r >= 64 {
return rs
}
return rs | 1<<uint(r)
}
func (rs RegSet) Range(f func(allocatedRealReg RealReg)) {
for i := 0; i < 64; i++ {
if rs&(1<<uint(i)) != 0 {
f(RealReg(i))
}
}
}
type regInUseSet struct {
set RegSet
vrs [64]VReg
}
func (rs *regInUseSet) reset() {
rs.set = 0
for i := range rs.vrs {
rs.vrs[i] = VRegInvalid
}
}
func (rs *regInUseSet) format(info *RegisterInfo) string { //nolint:unused
var ret []string
for i := 0; i < 64; i++ {
if rs.set&(1<<uint(i)) != 0 {
vr := rs.vrs[i]
ret = append(ret, fmt.Sprintf("(%s->v%d)", info.RealRegName(RealReg(i)), vr.ID()))
}
}
return strings.Join(ret, ", ")
}
func (rs *regInUseSet) has(r RealReg) bool {
if r >= 64 {
return false
}
return rs.set&(1<<uint(r)) != 0
}
func (rs *regInUseSet) get(r RealReg) VReg {
if r >= 64 {
return VRegInvalid
}
return rs.vrs[r]
}
func (rs *regInUseSet) remove(r RealReg) {
if r >= 64 {
return
}
rs.set &= ^(1 << uint(r))
rs.vrs[r] = VRegInvalid
}
func (rs *regInUseSet) add(r RealReg, vr VReg) {
if r >= 64 {
return
}
rs.set |= 1 << uint(r)
rs.vrs[r] = vr
}
func (rs *regInUseSet) range_(f func(allocatedRealReg RealReg, vr VReg)) {
for i := 0; i < 64; i++ {
if rs.set&(1<<uint(i)) != 0 {
f(RealReg(i), rs.vrs[i])
}
}
}
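
Since a RegSet is just a 64-bit mask, membership tests and iteration cost a bit test per register. The sketch below is not part of the diff; exampleRegSet is a hypothetical in-package usage with arbitrary register numbers.

func exampleRegSet() {
	callerSaved := NewRegSet(1, 2, 5) // Arbitrary example registers.
	if callerSaved.has(2) {
		// Visit the members in ascending register order.
		callerSaved.Range(func(r RealReg) {
			fmt.Println("caller-saved:", r)
		})
	}
}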

View File

@ -0,0 +1,43 @@
package backend
import (
"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)
// SSAValueDefinition represents a definition of an SSA value.
type SSAValueDefinition struct {
// BlockParamValue is valid if Instr == nil
BlockParamValue ssa.Value
// BlkParamVReg is valid if Instr == nil
BlkParamVReg regalloc.VReg
// Instr is not nil if this is a definition from an instruction.
Instr *ssa.Instruction
// N is the index of the return value in the instr's return values list.
N int
// RefCount is the number of references to the result.
RefCount int
}
func (d *SSAValueDefinition) IsFromInstr() bool {
return d.Instr != nil
}
func (d *SSAValueDefinition) IsFromBlockParam() bool {
return d.Instr == nil
}
func (d *SSAValueDefinition) SSAValue() ssa.Value {
if d.IsFromBlockParam() {
return d.BlockParamValue
} else {
r, rs := d.Instr.Returns()
if d.N == 0 {
return r
} else {
return rs[d.N-1]
}
}
}
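
To illustrate how these fields are typically consumed, here is a hypothetical helper, not part of the diff: a value that is defined by an instruction and referenced exactly once is the usual candidate for being matched or fused directly at its single use site.

// canFuseIntoSingleUse reports whether the definition comes from an instruction
// whose result is referenced exactly once.
func canFuseIntoSingleUse(d *SSAValueDefinition) bool {
	return d.IsFromInstr() && d.RefCount == 1
}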