304 lines
6.0 KiB
ArmAsm
304 lines
6.0 KiB
ArmAsm
|
// Copyright 2016 Tom Thorogood. All rights reserved.
|
||
|
// Use of this source code is governed by a
|
||
|
// Modified BSD License license that can be found in
|
||
|
// the LICENSE file.
|
||
|
//
|
||
|
// Copyright 2005-2016, Wojciech Muła. All rights reserved.
|
||
|
// Use of this source code is governed by a
|
||
|
// Simplified BSD License license that can be found in
|
||
|
// the LICENSE file.
|
||
|
//
|
||
|
// This file is auto-generated - do not modify
|
||
|
|
||
|
// +build amd64,!gccgo,!appengine
|
||
|
|
||
|
#include "textflag.h"
|
||
|
|
||
|
// Constant tables shared by decodeAVX and decodeSSE.
//
// NOTE(review): several of these are used directly as memory operands of
// legacy (non-VEX) SSE instructions, which require 16-byte alignment. That
// relies on how the linker places these symbols - confirm before resizing.

// decodeBase, row 0 (+0x00): 0x30 ('0') in every lane; subtracted from every
// lower-cased input byte.
// decodeBase, row 1 (+0x10): 0x27 in every lane; additionally subtracted from
// lanes whose lower-cased byte is not below 'a', so that 'a' maps to 10
// (0x61 - 0x30 - 0x27 = 0x0a).
DATA decodeBase<>+0x00(SB)/8, $0x3030303030303030
DATA decodeBase<>+0x08(SB)/8, $0x3030303030303030
DATA decodeBase<>+0x10(SB)/8, $0x2727272727272727
DATA decodeBase<>+0x18(SB)/8, $0x2727272727272727
GLOBL decodeBase<>(SB), RODATA, $32

// decodeToLower: 0x20 in every lane; ORed into the input so that 'A'..'F'
// become 'a'..'f'.
DATA decodeToLower<>+0x00(SB)/8, $0x2020202020202020
DATA decodeToLower<>+0x08(SB)/8, $0x2020202020202020
GLOBL decodeToLower<>(SB), RODATA, $16

// decodeHigh: PSHUFB control that gathers the even-indexed bytes
// (0, 2, ..., 14) into lanes 0-7. The 0xff control bytes zero lanes 8-15.
DATA decodeHigh<>+0x00(SB)/8, $0x0e0c0a0806040200
DATA decodeHigh<>+0x08(SB)/8, $0xffffffffffffffff
GLOBL decodeHigh<>(SB), RODATA, $16

// decodeLow: PSHUFB control that gathers the odd-indexed bytes
// (1, 3, ..., 15) into lanes 0-7. The 0xff control bytes zero lanes 8-15.
DATA decodeLow<>+0x00(SB)/8, $0x0f0d0b0907050301
DATA decodeLow<>+0x08(SB)/8, $0xffffffffffffffff
GLOBL decodeLow<>(SB), RODATA, $16

// decodeValid: four 16-byte rows of range bounds, each already XORed with
// 0x80 so they can be compared with the signed PCMPGTB against inputs that
// have been XORed with decodeToSigned:
//   +0x00: 0xb0 = '0' ^ 0x80
//   +0x10: 0xb9 = '9' ^ 0x80
//   +0x20: 0xe1 = 'a' ^ 0x80
//   +0x30: 0xe6 = 'f' ^ 0x80
DATA decodeValid<>+0x00(SB)/8, $0xb0b0b0b0b0b0b0b0
DATA decodeValid<>+0x08(SB)/8, $0xb0b0b0b0b0b0b0b0
DATA decodeValid<>+0x10(SB)/8, $0xb9b9b9b9b9b9b9b9
DATA decodeValid<>+0x18(SB)/8, $0xb9b9b9b9b9b9b9b9
DATA decodeValid<>+0x20(SB)/8, $0xe1e1e1e1e1e1e1e1
DATA decodeValid<>+0x28(SB)/8, $0xe1e1e1e1e1e1e1e1
DATA decodeValid<>+0x30(SB)/8, $0xe6e6e6e6e6e6e6e6
DATA decodeValid<>+0x38(SB)/8, $0xe6e6e6e6e6e6e6e6
GLOBL decodeValid<>(SB), RODATA, $64

// decodeToSigned: 0x80 in every lane; XORed into the input to flip the top
// bit of each byte before the signed compares.
DATA decodeToSigned<>+0x00(SB)/8, $0x8080808080808080
DATA decodeToSigned<>+0x08(SB)/8, $0x8080808080808080
GLOBL decodeToSigned<>(SB), RODATA, $16
|
||
|
|
||
|
// decodeAVX converts ASCII hex digits at src into bytes at dst, two input
// bytes per output byte, 16 input bytes per main-loop iteration. It mixes
// VEX-encoded three-operand instructions with legacy SSE ones.
//
// Frame slots as used below (the Go declaration is not part of this file):
//   dst+0(FP)   in   output pointer
//   src+8(FP)   in   input pointer
//   len+16(FP)  in   number of input bytes
//   n+24(FP)    out  written ONLY on failure: offset of the first bad byte
//   ok+32(FP)   out  1 on success, 0 on failure
//
// Register roles:
//   DI   output cursor            SI   input cursor
//   R15  original src (for n)     BX   input bytes still to process
//   DX   16-bit mask of lanes that take part in the validity test
//   AX   per-lane "invalid" bitmap from PMOVMSKB
//   X14  decodeValid row 0 ('0'^0x80)   X15  decodeValid row 2 ('a'^0x80)
//
// An empty input (len == 0) returns ok = 1 immediately without touching
// src or dst.
//
// NOTE(review): the tail dispatch only has entry points for even byte
// counts, so an odd len is not handled precisely - presumably the caller
// rejects odd lengths; confirm.
TEXT ·decodeAVX(SB),NOSPLIT,$0
	MOVQ dst+0(FP), DI
	MOVQ src+8(FP), SI
	MOVQ len+16(FP), BX

	// Empty input: without this guard the tail path below would load two
	// bytes from src and store one byte to dst even though len is zero.
	TESTQ BX, BX
	JZ ret

	MOVQ SI, R15
	MOVOU decodeValid<>(SB), X14
	MOVOU decodeValid<>+0x20(SB), X15
	MOVW $65535, DX // all 16 lanes are checked in the main loop
	CMPQ BX, $16
	JB tail

bigloop:
	// X0 = 16 raw input bytes.
	MOVOU (SI), X0

	// X1 = raw ^ 0x80, X0 = raw | 0x20 (lower-cased), X2 = lower ^ 0x80.
	// Flipping the top bit lets the signed PCMPGTB order bytes as unsigned.
	VPXOR decodeToSigned<>(SB), X0, X1
	POR decodeToLower<>(SB), X0
	VPXOR decodeToSigned<>(SB), X0, X2

	// Per-lane range checks (0xff = true):
	//   X3 = raw   < '0'
	//   X1 = raw   > '9'
	//   X4 = lower < 'a'   (kept for the digit/letter adjustment below)
	//   X2 = lower > 'f'
	VPCMPGTB X1, X14, X3
	PCMPGTB decodeValid<>+0x10(SB), X1
	VPCMPGTB X2, X15, X4
	PCMPGTB decodeValid<>+0x30(SB), X2

	// invalid = (raw < '0') | (lower > 'f') | (raw > '9' & lower < 'a')
	PAND X4, X1
	POR X2, X3
	POR X1, X3
	PMOVMSKB X3, AX
	TESTW AX, DX
	JNZ invalid

	// Digit value: lower - '0', and a further 0x27 for lanes that are not
	// below 'a'. PANDN turns X4 into (~X4 & 0x27).
	PSUBB decodeBase<>(SB), X0
	PANDN decodeBase<>+0x10(SB), X4
	PSUBB X4, X0

	// Pack nibble pairs: X3 = odd-indexed values (low nibbles),
	// X0 = even-indexed values shifted left by 4 (high nibbles).
	VPSHUFB decodeLow<>(SB), X0, X3
	PSHUFB decodeHigh<>(SB), X0
	PSLLW $4, X0
	POR X3, X0

	// 16 input bytes -> 8 output bytes.
	MOVQ X0, (DI)

	SUBQ $16, BX
	JZ ret
	ADDQ $16, SI
	ADDQ $8, DI
	CMPQ BX, $16
	JAE bigloop

tail:
	// 0 < BX < 16 here. Keep only the low BX bits of DX so that lanes of X0
	// which are not loaded below (stale contents) cannot fail validation.
	MOVQ $16, CX
	SUBQ BX, CX
	SHRW CX, DX

	// Jump into the load ladder at the entry matching BX; each entry falls
	// through to the ones for smaller counts.
	CMPQ BX, $4
	JB tail_in_2
	JE tail_in_4
	CMPQ BX, $8
	JB tail_in_6
	JE tail_in_8
	CMPQ BX, $12
	JB tail_in_10
	JE tail_in_12

tail_in_14:
	PINSRW $6, 12(SI), X0
tail_in_12:
	PINSRW $5, 10(SI), X0
tail_in_10:
	PINSRW $4, 8(SI), X0
tail_in_8:
	PINSRQ $0, (SI), X0
	JMP tail_conv

tail_in_6:
	PINSRW $2, 4(SI), X0
tail_in_4:
	PINSRW $1, 2(SI), X0
tail_in_2:
	PINSRW $0, (SI), X0

tail_conv:
	// Same validation and conversion sequence as in bigloop; only the lanes
	// selected by DX are tested.
	VPXOR decodeToSigned<>(SB), X0, X1
	POR decodeToLower<>(SB), X0
	VPXOR decodeToSigned<>(SB), X0, X2
	VPCMPGTB X1, X14, X3
	PCMPGTB decodeValid<>+0x10(SB), X1
	VPCMPGTB X2, X15, X4
	PCMPGTB decodeValid<>+0x30(SB), X2
	PAND X4, X1
	POR X2, X3
	POR X1, X3
	PMOVMSKB X3, AX
	TESTW AX, DX
	JNZ invalid
	PSUBB decodeBase<>(SB), X0
	PANDN decodeBase<>+0x10(SB), X4
	PSUBB X4, X0
	VPSHUFB decodeLow<>(SB), X0, X3
	PSHUFB decodeHigh<>(SB), X0
	PSLLW $4, X0
	POR X3, X0

	// Store ladder, mirroring the load ladder: BX/2 output bytes are written
	// for the even counts 2..14.
	CMPQ BX, $4
	JB tail_out_2
	JE tail_out_4
	CMPQ BX, $8
	JB tail_out_6
	JE tail_out_8
	CMPQ BX, $12
	JB tail_out_10
	JE tail_out_12

tail_out_14:
	PEXTRB $6, X0, 6(DI)
tail_out_12:
	PEXTRB $5, X0, 5(DI)
tail_out_10:
	PEXTRB $4, X0, 4(DI)
tail_out_8:
	MOVL X0, (DI)
	JMP ret

tail_out_6:
	PEXTRB $2, X0, 2(DI)
tail_out_4:
	PEXTRB $1, X0, 1(DI)
tail_out_2:
	PEXTRB $0, X0, (DI)

ret:
	// Success: n is intentionally left unwritten on this path.
	MOVB $1, ok+32(FP)
	RET

invalid:
	// n = (SI - original src) + index of the lowest set bit in AX.
	// In the tail, TESTW has just proven a set bit below BX, so the lowest
	// set bit of AX is always inside the valid range.
	BSFW AX, AX
	SUBQ R15, SI
	ADDQ SI, AX
	MOVQ AX, n+24(FP)
	MOVB $0, ok+32(FP)
	RET
|
||
|
|
||
|
// decodeSSE converts ASCII hex digits at src into bytes at dst, two input
// bytes per output byte, 16 input bytes per main-loop iteration. It uses
// only legacy (two-operand) SSE encodings, copying registers with MOVOU
// where decodeAVX uses a three-operand form.
//
// NOTE(review): PSHUFB, PINSRQ and PEXTRB are used, so baseline SSE2 is not
// sufficient - CPU feature gating is presumably done by the Go caller.
//
// Frame slots as used below (the Go declaration is not part of this file):
//   dst+0(FP)   in   output pointer
//   src+8(FP)   in   input pointer
//   len+16(FP)  in   number of input bytes
//   n+24(FP)    out  written ONLY on failure: offset of the first bad byte
//   ok+32(FP)   out  1 on success, 0 on failure
//
// Register roles:
//   DI   output cursor            SI   input cursor
//   R15  original src (for n)     BX   input bytes still to process
//   DX   16-bit mask of lanes that take part in the validity test
//   AX   per-lane "invalid" bitmap from PMOVMSKB
//   X14  decodeValid row 0 ('0'^0x80)   X15  decodeValid row 2 ('a'^0x80)
//
// An empty input (len == 0) returns ok = 1 immediately without touching
// src or dst.
//
// NOTE(review): the tail dispatch only has entry points for even byte
// counts, so an odd len is not handled precisely - presumably the caller
// rejects odd lengths; confirm.
TEXT ·decodeSSE(SB),NOSPLIT,$0
	MOVQ dst+0(FP), DI
	MOVQ src+8(FP), SI
	MOVQ len+16(FP), BX

	// Empty input: without this guard the tail path below would load two
	// bytes from src and store one byte to dst even though len is zero.
	TESTQ BX, BX
	JZ ret

	MOVQ SI, R15
	MOVOU decodeValid<>(SB), X14
	MOVOU decodeValid<>+0x20(SB), X15
	MOVW $65535, DX // all 16 lanes are checked in the main loop
	CMPQ BX, $16
	JB tail

bigloop:
	// X0 = 16 raw input bytes.
	MOVOU (SI), X0

	// X1 = raw ^ 0x80, X0 = raw | 0x20 (lower-cased), X2 = lower ^ 0x80.
	// Flipping the top bit lets the signed PCMPGTB order bytes as unsigned.
	MOVOU X0, X1
	PXOR decodeToSigned<>(SB), X1
	POR decodeToLower<>(SB), X0
	MOVOU X0, X2
	PXOR decodeToSigned<>(SB), X2

	// Per-lane range checks (0xff = true):
	//   X3 = raw   < '0'
	//   X1 = raw   > '9'
	//   X4 = lower < 'a'   (kept for the digit/letter adjustment below)
	//   X2 = lower > 'f'
	MOVOU X14, X3
	PCMPGTB X1, X3
	PCMPGTB decodeValid<>+0x10(SB), X1
	MOVOU X15, X4
	PCMPGTB X2, X4
	PCMPGTB decodeValid<>+0x30(SB), X2

	// invalid = (raw < '0') | (lower > 'f') | (raw > '9' & lower < 'a')
	PAND X4, X1
	POR X2, X3
	POR X1, X3
	PMOVMSKB X3, AX
	TESTW AX, DX
	JNZ invalid

	// Digit value: lower - '0', and a further 0x27 for lanes that are not
	// below 'a'. PANDN turns X4 into (~X4 & 0x27).
	PSUBB decodeBase<>(SB), X0
	PANDN decodeBase<>+0x10(SB), X4
	PSUBB X4, X0

	// Pack nibble pairs: X3 = odd-indexed values (low nibbles),
	// X0 = even-indexed values shifted left by 4 (high nibbles).
	MOVOU X0, X3
	PSHUFB decodeLow<>(SB), X3
	PSHUFB decodeHigh<>(SB), X0
	PSLLW $4, X0
	POR X3, X0

	// 16 input bytes -> 8 output bytes.
	MOVQ X0, (DI)

	SUBQ $16, BX
	JZ ret
	ADDQ $16, SI
	ADDQ $8, DI
	CMPQ BX, $16
	JAE bigloop

tail:
	// 0 < BX < 16 here. Keep only the low BX bits of DX so that lanes of X0
	// which are not loaded below (stale contents) cannot fail validation.
	MOVQ $16, CX
	SUBQ BX, CX
	SHRW CX, DX

	// Jump into the load ladder at the entry matching BX; each entry falls
	// through to the ones for smaller counts.
	CMPQ BX, $4
	JB tail_in_2
	JE tail_in_4
	CMPQ BX, $8
	JB tail_in_6
	JE tail_in_8
	CMPQ BX, $12
	JB tail_in_10
	JE tail_in_12

tail_in_14:
	PINSRW $6, 12(SI), X0
tail_in_12:
	PINSRW $5, 10(SI), X0
tail_in_10:
	PINSRW $4, 8(SI), X0
tail_in_8:
	PINSRQ $0, (SI), X0
	JMP tail_conv

tail_in_6:
	PINSRW $2, 4(SI), X0
tail_in_4:
	PINSRW $1, 2(SI), X0
tail_in_2:
	PINSRW $0, (SI), X0

tail_conv:
	// Same validation and conversion sequence as in bigloop; only the lanes
	// selected by DX are tested.
	MOVOU X0, X1
	PXOR decodeToSigned<>(SB), X1
	POR decodeToLower<>(SB), X0
	MOVOU X0, X2
	PXOR decodeToSigned<>(SB), X2
	MOVOU X14, X3
	PCMPGTB X1, X3
	PCMPGTB decodeValid<>+0x10(SB), X1
	MOVOU X15, X4
	PCMPGTB X2, X4
	PCMPGTB decodeValid<>+0x30(SB), X2
	PAND X4, X1
	POR X2, X3
	POR X1, X3
	PMOVMSKB X3, AX
	TESTW AX, DX
	JNZ invalid
	PSUBB decodeBase<>(SB), X0
	PANDN decodeBase<>+0x10(SB), X4
	PSUBB X4, X0
	MOVOU X0, X3
	PSHUFB decodeLow<>(SB), X3
	PSHUFB decodeHigh<>(SB), X0
	PSLLW $4, X0
	POR X3, X0

	// Store ladder, mirroring the load ladder: BX/2 output bytes are written
	// for the even counts 2..14.
	CMPQ BX, $4
	JB tail_out_2
	JE tail_out_4
	CMPQ BX, $8
	JB tail_out_6
	JE tail_out_8
	CMPQ BX, $12
	JB tail_out_10
	JE tail_out_12

tail_out_14:
	PEXTRB $6, X0, 6(DI)
tail_out_12:
	PEXTRB $5, X0, 5(DI)
tail_out_10:
	PEXTRB $4, X0, 4(DI)
tail_out_8:
	MOVL X0, (DI)
	JMP ret

tail_out_6:
	PEXTRB $2, X0, 2(DI)
tail_out_4:
	PEXTRB $1, X0, 1(DI)
tail_out_2:
	PEXTRB $0, X0, (DI)

ret:
	// Success: n is intentionally left unwritten on this path.
	MOVB $1, ok+32(FP)
	RET

invalid:
	// n = (SI - original src) + index of the lowest set bit in AX.
	// In the tail, TESTW has just proven a set bit below BX, so the lowest
	// set bit of AX is always inside the valid range.
	BSFW AX, AX
	SUBQ R15, SI
	ADDQ SI, AX
	MOVQ AX, n+24(FP)
	MOVB $0, ok+32(FP)
	RET
|