mirror of
https://github.com/Cateners/tiny_computer.git
synced 2026-05-21 00:45:49 +08:00
Update code to v1.0.14 (10)
This commit is contained in:
761
android/extern/libjpeg-turbo/simd/i386/jchuff-sse2.asm
vendored
Normal file
761
android/extern/libjpeg-turbo/simd/i386/jchuff-sse2.asm
vendored
Normal file
@@ -0,0 +1,761 @@
|
||||
;
|
||||
; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
|
||||
;
|
||||
; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
|
||||
; Copyright (C) 2015, Matthieu Darbois.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation for Huffman coding of one block.
|
||||
; The following code is based on jchuff.c; see jchuff.c for more details.
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; Field offsets of the encoder state record that this file reads and writes.
; resp is not defined in this file (presumably a pointer-sized reservation
; supplied by jsimdext.inc -- confirm there).
struc working_state
    .next_output_byte:      resp 1      ; pointer to the next byte to write in the buffer
    .free_in_buffer:        resp 1      ; count of byte slots still free in the buffer
    .cur.put_buffer.simd:   resq 1      ; 64-bit bit accumulation buffer
    .cur.free_bits:         resd 1      ; number of bits still available in put_buffer
    .cur.last_dc_val:       resd 4      ; previous DC coefficient, one per component
    .cinfo:                 resp 1      ; needed by dump_buffer
endstruc
|
||||
|
||||
; Field offsets of a derived Huffman encoding table (used for both dctbl and
; actbl in this file).
struc c_derived_tbl
    .ehufco:    resd 256    ; Huffman code for each symbol
    .ehufsi:    resb 256    ; code length for each symbol; ehufsi[S] is 0
                            ; when no code has been allocated for symbol S
endstruc
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
GLOBAL_DATA(jconst_huff_encode_one_block)
|
||||
|
||||
EXTN(jconst_huff_encode_one_block):
|
||||
|
||||
alignz 32
|
||||
|
||||
jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
|
||||
dq 0x000f, 0x001f, 0x003f, 0x007f
|
||||
dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
|
||||
dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
|
||||
|
||||
times 1 << 14 db 15
|
||||
times 1 << 13 db 14
|
||||
times 1 << 12 db 13
|
||||
times 1 << 11 db 12
|
||||
times 1 << 10 db 11
|
||||
times 1 << 9 db 10
|
||||
times 1 << 8 db 9
|
||||
times 1 << 7 db 8
|
||||
times 1 << 6 db 7
|
||||
times 1 << 5 db 6
|
||||
times 1 << 4 db 5
|
||||
times 1 << 3 db 4
|
||||
times 1 << 2 db 3
|
||||
times 1 << 1 db 2
|
||||
times 1 << 0 db 1
|
||||
times 1 db 0
|
||||
jpeg_nbits_table:
|
||||
times 1 db 0
|
||||
times 1 << 0 db 1
|
||||
times 1 << 1 db 2
|
||||
times 1 << 2 db 3
|
||||
times 1 << 3 db 4
|
||||
times 1 << 4 db 5
|
||||
times 1 << 5 db 6
|
||||
times 1 << 6 db 7
|
||||
times 1 << 7 db 8
|
||||
times 1 << 8 db 9
|
||||
times 1 << 9 db 10
|
||||
times 1 << 10 db 11
|
||||
times 1 << 11 db 12
|
||||
times 1 << 12 db 13
|
||||
times 1 << 13 db 14
|
||||
times 1 << 14 db 15
|
||||
|
||||
alignz 32
|
||||
|
||||
; NBITS(x) - address expression for jpeg_nbits_table[x].  Without PIC the
; table label is used directly; with PIC the table address is taken from the
; nbits_base register.
%ifndef PIC
%define NBITS(x)      jpeg_nbits_table + x
%else
%define NBITS(x)      nbits_base + x
%endif
; MASK_BITS(x) - address expression for jpeg_mask_bits[x] (8-byte entries),
; written relative to the same base as NBITS() by adding the constant
; distance between the two tables.
%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
; MMX register roles.

; Bit accumulation buffer and the all-ones constant used to detect 0xFF bytes.
%define mm_put_buffer     mm0
%define mm_all_0xff       mm1

; Scratch registers.  mm_nbits and mm_code_bits deliberately name the same
; register (mm3); they are never live at the same time.
%define mm_temp           mm2
%define mm_nbits          mm3
%define mm_code_bits      mm3
%define mm_code           mm4

; Used only inside EMIT_QWORD: the shift count for the bits that do not fit,
; and a save slot for nbits_base while its register is borrowed as a temp.
%define mm_overflow_bits  mm5
%define mm_save_nbits     mm6
|
||||
|
||||
; Shorthand used to describe SIMD operations:
|
||||
; wN: xmmN treated as eight signed 16-bit values
|
||||
; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
|
||||
; bN: xmmN treated as 16 unsigned 8-bit values, or
|
||||
; mmN treated as eight unsigned 8-bit values
|
||||
; bN[i]: perform the same operation on all unsigned 8-bit values,
|
||||
; i=0..15 (SSE register) or i=0..7 (MMX register)
|
||||
; Contents of SIMD registers are shown in memory order.
|
||||
|
||||
; EMIT_STUFFED_BYTE - store one output byte, followed by a zero byte if the
; byte is 0xFF (the same 0xFF -> 0xFF 0x00 encoding as the EMIT_BYTE() macro
; in jchuff.c).
;
; Usage:
; %1 - 8-bit register holding the byte to store
;
; The byte goes to buffer[0] and a zero to buffer[1]; buffer then advances by
; 1, or by 2 when the byte was 0xFF so that the zero is kept.  Clobbers flags.

%macro EMIT_STUFFED_BYTE 1
    mov         byte [buffer], %1       ; buffer[0] = byte;
    cmp         %1, 0xFF                ; CF = (byte < 0xFF)
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         buffer, -2              ; buffer += 2 - CF;
%endmacro

; EMIT_QWORD - top up the bit buffer with the leading bits of code, write the
; full 64-bit bit buffer to the output (most significant byte first), and
; start a new bit buffer with the bits of code that did not fit.
;
; On entry (as arranged by every call site in this file):
; code      - bits to shift into the bit buffer (LSB-aligned)
; nbits     - number of bits in code
; free_bits - already decremented by nbits, hence <= 0
;
; Usage:
; %1 - temp register
; %2 - low byte of temp register
; %3 - second byte of temp register
; %4-%8 (may be empty) - extra instructions to execute before the macro
;                        completes
; %9 - the label to which to jump when the macro completes
;
; On completion, free_bits is 64 minus the number of bits left over from
; code, and put_buffer holds those leftover bits.  temp, nbits and code are
; clobbered.  If temp is the register that holds nbits_base, it is saved in
; mm_save_nbits and restored before the extra instructions run.
;
; Bytes equal to 0xFF are followed by a stuffed 0x00.  When none of the 8
; bytes is 0xFF, they are stored with two dword writes instead.

%macro EMIT_QWORD 9
%define %%temp   %1
%define %%tempb  %2
%define %%temph  %3
    ; Split code into the part that still fits and the overflow.
    add         nbits, free_bits             ; nbits = # of bits of code that fit
    neg         free_bits                    ; free_bits = # of bits that overflow
    movq        mm_temp, mm_code             ; temp = code;
    movd        mm_nbits, nbits              ; nbits --> MMX register
    movd        mm_overflow_bits, free_bits  ; overflow_bits = free_bits;
    neg         free_bits                    ; free_bits = -free_bits;
    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
    add         free_bits, 64                ; free_bits += 64;
    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;  /* full qword */
%ifidn %%temp, nbits_base
    movd        mm_save_nbits, nbits_base    ; temp aliases nbits_base: save it
%endif
    movq        mm_code_bits, mm_temp        ; code_bits = temp;
    movq        mm_put_buffer, mm_code       ; put_buffer = code;  /* leftover bits */
    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
    movq        mm_code, mm_code_bits        ; code = code_bits;  /* low dword, for later */
    psrlq       mm_code_bits, 32             ; code_bits >>= 32;  /* high dword */
    pmovmskb    nbits, mm_temp               ; nbits = bit mask of the bytes that are 0xFF
    movd        %%temp, mm_code_bits         ; temp = code_bits;
    bswap       %%temp                       ; temp = htonl(temp);
    test        nbits, nbits                 ; if (nbits != 0)  /* some 0xFF bytes */
    jnz         %%.SLOW                      ;   goto %%.SLOW

    ; Fast path: no byte needs stuffing, store both dwords directly.
    mov         dword [buffer], %%temp       ; *(uint32_t)buffer = temp;
%ifidn %%temp, nbits_base
    movd        nbits_base, mm_save_nbits    ; restore nbits_base
%endif
    %4
    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
    %5
    bswap       nbits                        ; nbits = htonl(nbits);
    mov         dword [buffer + 4], nbits    ; *(uint32_t)(buffer + 4) = nbits;
    lea         buffer, [buffer + 8]         ; buffer += 8;
    %6
    %7
    %8
    jmp         %9                           ; return

%%.SLOW:
    ; Slow path: emit all 8 bytes one at a time with byte stuffing.
    ; High dword, most significant byte first.
    EMIT_STUFFED_BYTE %%tempb
    EMIT_STUFFED_BYTE %%temph
    shr         %%temp, 16                   ; temp >>= 16;
    EMIT_STUFFED_BYTE %%tempb
    EMIT_STUFFED_BYTE %%temph
    ; Low dword, using nbits as the temp register.
    movd        nbits, mm_code               ; nbits = (uint32_t)(code)
%ifidn %%temp, nbits_base
    movd        nbits_base, mm_save_nbits    ; restore nbits_base
%endif
    bswap       nbits                        ; nbits = htonl(nbits)
    EMIT_STUFFED_BYTE nbitsb
    EMIT_STUFFED_BYTE nbitsh
    shr         nbits, 16                    ; nbits >>= 16;
    EMIT_STUFFED_BYTE nbitsb
    ; Last byte is written out by hand so that %4 can be placed before the
    ; cmp that produces the carry consumed by sbb.
    mov         byte [buffer], nbitsh        ; buffer[0] = nbits[1];
    %4
    cmp         nbitsh, 0xFF                 ; CF = (nbits[1] < 0xFF)
    mov         byte [buffer+1], 0           ; buffer[1] = 0;
    sbb         buffer, -2                   ; buffer += 2 - CF;
    %5
    %6
    %7
    %8
    jmp         %9                           ; return;
%endmacro
|
||||
|
||||
; PUSH - push an operand and account for it in stack_offset.
;
; Usage:
; %1 - operand to push (the bookkeeping assumes a 4-byte push)
;
; stack_offset grows by 4 so that the arg_* offsets, which are defined in
; terms of stack_offset, stay relative to the new top of stack.
%macro PUSH 1
|
||||
push %1
|
||||
%assign stack_offset stack_offset + 4
|
||||
%endmacro
|
||||
|
||||
; POP - pop an operand and account for it in stack_offset.
;
; Usage:
; %1 - operand to pop into (the bookkeeping assumes a 4-byte pop)
;
; Counterpart of PUSH: stack_offset shrinks by 4.
%macro POP 1
|
||||
pop %1
|
||||
%assign stack_offset stack_offset - 4
|
||||
%endmacro
|
||||
|
||||
; GET_SYM - if PIC is defined, load the address of a symbol defined in this
; file into a register.  Equivalent to
;   get_GOT %1
;   lea %1, [GOTOFF(%1, %2)]
; without using the GOT.
;
; Usage:
; %1 - register into which to load the address of the symbol
; %2 - symbol whose address should be loaded
; %3 - multi-line macro to execute before the symbol address is loaded.  It is
;      invoked with one argument, the number of bytes by which the stack
;      pointer is currently displaced: 4 with PIC (it runs inside the helper
;      subroutine, below the return address), 0 otherwise.
; %4 - multi-line macro to execute after the helper subroutine has returned
;      and before the symbol offset is added to %1
;
; If PIC is not defined, %3 and %4 are simply executed in order and %1 is not
; touched.

%macro GET_SYM 2-4
%ifndef PIC
    %3          0
    %4
%else
    call        %%.geteip               ; pushes the address of %%.ref
%%.ref:
    %4
    add         %1, %2 - %%.ref         ; %1 = &%%.ref + (%2 - &%%.ref) = &%2
    jmp         short %%.done
    align       32
%%.geteip:
    %3          4                       ; must adjust stack pointer because of call
    mov         %1, POINTER [esp]       ; %1 = return address = &%%.ref
    ret
    align       32
%%.done:
%endif
%endmacro
|
||||
|
||||
;
|
||||
; Encode a single block's worth of coefficients.
|
||||
;
|
||||
; GLOBAL(JOCTET *)
|
||||
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
|
||||
; JCOEFPTR block, int last_dc_val,
|
||||
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
|
||||
;
|
||||
; Stack layout:
|
||||
; Function args
|
||||
; Return address
|
||||
; Saved ebx
|
||||
; Saved ebp
|
||||
; Saved esi
|
||||
; Saved edi <-- esp_save
|
||||
; ...
|
||||
; esp_save
|
||||
; t_ 64*2 bytes (aligned to 128 bytes)
|
||||
;
|
||||
; esp is used (as t) to point into t_ (data in lower indices is not used once
|
||||
; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows
|
||||
; us to find the rest of the data again.
|
||||
;
|
||||
; NOTES:
|
||||
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
|
||||
; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
|
||||
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
|
||||
; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
|
||||
; requires 2 µops (with memory operand) on Intel. In either case, only one
|
||||
; pinsrw instruction can be decoded per cycle (and nothing else if they are
|
||||
; back-to-back), so out-of-order execution cannot be used to work around long
|
||||
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
|
||||
; problem if the code runs from the µop cache.)
|
||||
;
|
||||
; We use tzcnt instead of bsf without checking for support. The instruction is
|
||||
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
|
||||
; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
|
||||
; an input dependency (although the behavior is not formally defined, Intel
|
||||
; CPUs usually leave the destination unmodified if the source is zero.) This
|
||||
; can prevent out-of-order execution, so we clear the destination before
|
||||
; invoking tzcnt.
|
||||
;
|
||||
; Initial register allocation
|
||||
; eax - frame --> buffer
|
||||
; ebx - nbits_base (PIC) / emit_temp
|
||||
; ecx - dctbl --> size --> state
|
||||
; edx - block --> nbits
|
||||
; esi - code_temp --> state --> actbl
|
||||
; edi - index_temp --> free_bits
|
||||
; esp - t
|
||||
; ebp - index
|
||||
|
||||
; Register aliases in effect at function entry.  Several of them are
; %undef'ed and replaced further down as the registers change roles.

; eax
%define frame       eax

; ebx - table base when building PIC code, and always the EMIT_QWORD temp
%ifdef PIC
%define nbits_base  ebx
%endif
%define emit_temp   ebx
%define emit_tempb  bl
%define emit_temph  bh

; ecx, edx
%define dctbl       ecx
%define block       edx

; esi, edi
%define code_temp   esi
%define index_temp  edi

; esp, ebp
%define t           esp
%define index       ebp

; Offset from &t_[0] of the slot that holds the saved frame pointer; it sits
; immediately above the DCTSIZE2-word t_ array.
%assign save_frame  DCTSIZE2 * SIZEOF_WORD
|
||||
|
||||
; Step 1: Re-arrange input data according to jpeg_natural_order
|
||||
; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
|
||||
; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
|
||||
; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
|
||||
; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
|
||||
; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
|
||||
; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
|
||||
; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
|
||||
; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
|
||||
|
||||
align 32
|
||||
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
|
||||
|
||||
EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
|
||||
%assign stack_offset 0
|
||||
%define arg_state 4 + stack_offset
|
||||
%define arg_buffer 8 + stack_offset
|
||||
%define arg_block 12 + stack_offset
|
||||
%define arg_last_dc_val 16 + stack_offset
|
||||
%define arg_dctbl 20 + stack_offset
|
||||
%define arg_actbl 24 + stack_offset
|
||||
|
||||
;X: X = code stream
|
||||
mov block, [esp + arg_block]
|
||||
PUSH ebx
|
||||
PUSH ebp
|
||||
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
|
||||
PUSH esi
|
||||
PUSH edi
|
||||
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
|
||||
mov frame, esp
|
||||
lea t, [frame - (save_frame + 4)]
|
||||
movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
|
||||
and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
|
||||
mov [t + save_frame], frame
|
||||
pxor xmm4, xmm4 ;A: w4[i] = 0;
|
||||
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
|
||||
pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
|
||||
pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
|
||||
punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
|
||||
punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
|
||||
pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
|
||||
;A: (Row 0, offset 1)
|
||||
pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
|
||||
paddw xmm0, xmm4 ;A: w0[i] += w4[i];
|
||||
movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
|
||||
|
||||
movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
|
||||
pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
|
||||
pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
|
||||
movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
|
||||
movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
|
||||
punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
|
||||
pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
|
||||
pxor xmm4, xmm4 ;A: w4[i] = 0;
|
||||
psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
|
||||
pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
|
||||
pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
|
||||
; (Row 1, offset 1)
|
||||
pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
|
||||
paddw xmm1, xmm4 ;B: w1[i] += w4[i];
|
||||
movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
|
||||
pxor xmm4, xmm4 ;B: w4[i] = 0;
|
||||
pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
|
||||
|
||||
packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
|
||||
; w/ signed saturation
|
||||
|
||||
pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
|
||||
pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
|
||||
pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
|
||||
pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
|
||||
; (Row 3, offset 1)
|
||||
pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
|
||||
paddw xmm3, xmm4 ;D: w3[i] += w4[i];
|
||||
movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
|
||||
pxor xmm4, xmm4 ;D: w4[i] = 0;
|
||||
pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
|
||||
|
||||
pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
|
||||
pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
|
||||
pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
|
||||
pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
|
||||
pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
|
||||
pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
|
||||
; (Row 2, offset 1)
|
||||
pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
|
||||
paddw xmm2, xmm4 ;C: w2[i] += w4[i];
|
||||
movsx code_temp, word [block] ;Z: code_temp = block[0];
|
||||
|
||||
; GET_SYM_BEFORE - work scheduled into the first half of GET_SYM (passed as
; its %3 argument below).
;
; Stores row C to t[16..23], subtracts last_dc_val from the DC coefficient in
; code_temp, builds the 32-bit zero mask for rows A-D in index (bit set =
; coefficient is zero), and starts assembling row H in xmm3.
;
; On exit, edi stops being index_temp and is defined as free_bits.
;
; %1 - stack pointer adjustment
;      (GET_SYM passes 4 when this runs inside its helper subroutine, where
;      esp, and therefore t, is 4 bytes lower; 0 otherwise)
|
||||
%macro GET_SYM_BEFORE 1
|
||||
movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
|
||||
;C: t[i+16] = w2[i];
|
||||
pxor xmm4, xmm4 ;C: w4[i] = 0;
|
||||
pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
|
||||
sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
|
||||
|
||||
packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
|
||||
; w/ signed saturation
|
||||
|
||||
movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
|
||||
pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
|
||||
pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
|
||||
movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
|
||||
punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
|
||||
shl index_temp, 16 ;Z: index_temp <<= 16;
|
||||
psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
|
||||
pxor xmm2, xmm2 ;H: w2[i] = 0;
|
||||
pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
|
||||
or index, index_temp ;Z: index |= index_temp;
|
||||
%undef index_temp
|
||||
; edi is free again now that index_temp has been merged into index
|
||||
%define free_bits edi
|
||||
%endmacro
|
||||
|
||||
; GET_SYM_AFTER - work scheduled into the second half of GET_SYM (passed as
; its %4 argument below).  Takes no arguments.
;
; Inverts index (so that a set bit marks a nonzero coefficient), finishes row
; H and stores it to t[56..63], loads dctbl from the stack frame, starts
; assembling rows E, F and G, and fills mm_all_0xff with ones.
%macro GET_SYM_AFTER 0
|
||||
movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
|
||||
unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
|
||||
pxor xmm0, xmm0 ;H: w0[i] = 0;
|
||||
not index ;Z: index = ~index;
|
||||
pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
|
||||
; (Row 7, offset 1)
|
||||
pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
|
||||
mov dctbl, [frame + arg_dctbl] ;Z: dctbl = function argument dctbl
|
||||
paddw xmm3, xmm2 ;H: w3[i] += w2[i];
|
||||
movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
|
||||
movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
|
||||
pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
|
||||
punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
|
||||
movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
|
||||
pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
|
||||
%endmacro
|
||||
|
||||
GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
|
||||
|
||||
psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
|
||||
shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
|
||||
pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
|
||||
pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
|
||||
pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
|
||||
pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
|
||||
cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
|
||||
;Z: i.e. if code_temp is positive
|
||||
pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
|
||||
movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
|
||||
pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
|
||||
pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
|
||||
pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
|
||||
; (Row 6, offset 1)
|
||||
adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
|
||||
pxor xmm2, xmm2 ;G: w2[i] = 0;
|
||||
pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
|
||||
pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
|
||||
paddw xmm4, xmm0 ;G: w4[i] += w0[i];
|
||||
movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
|
||||
movd mm_temp, code_temp ;Z: temp = code_temp
|
||||
pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
|
||||
; (Row 5, offset 1)
|
||||
pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
|
||||
|
||||
packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
|
||||
; w/ signed saturation
|
||||
|
||||
lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1]
|
||||
pxor xmm0, xmm0 ;F: w0[i] = 0;
|
||||
pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
|
||||
paddw xmm1, xmm2 ;F: w1[i] += w2[i];
|
||||
movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
|
||||
pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
|
||||
pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
|
||||
pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
|
||||
pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
|
||||
pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
|
||||
; (Row 4, offset 1)
|
||||
%undef block
|
||||
%define nbits edx
|
||||
%define nbitsb dl
|
||||
%define nbitsh dh
|
||||
movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
|
||||
%undef code_temp
|
||||
%define state esi
|
||||
pxor xmm2, xmm2 ;E: w2[i] = 0;
|
||||
mov state, [frame + arg_state]
|
||||
movd mm_nbits, nbits ;Z: nbits --> MMX register
|
||||
pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
|
||||
movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
|
||||
;Z: code = dctbl->ehufco[nbits];
|
||||
%define size ecx
|
||||
%define sizeb cl
|
||||
%define sizeh ch
|
||||
paddw xmm5, xmm0 ;E: w5[i] += w0[i];
|
||||
movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
|
||||
movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
|
||||
;Z: size = dctbl->ehufsi[nbits];
|
||||
%undef dctbl
|
||||
pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
|
||||
|
||||
packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
|
||||
; w/ signed saturation
|
||||
|
||||
movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
|
||||
;Z: put_buffer = state->cur.put_buffer.simd;
|
||||
mov free_bits, [state + working_state.cur.free_bits]
|
||||
;Z: free_bits = state->cur.free_bits;
|
||||
%undef state
|
||||
%define actbl esi
|
||||
mov actbl, [frame + arg_actbl]
|
||||
%define buffer eax
|
||||
mov buffer, [frame + arg_buffer]
|
||||
%undef frame
|
||||
jmp .BEGIN
|
||||
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
align 16
|
||||
; size <= 32, so this is not really a loop
|
||||
.BRLOOP1: ; .BRLOOP1:
|
||||
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
|
||||
; nbits = actbl->ehufsi[0xf0];
|
||||
movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
|
||||
; code = actbl->ehufco[0xf0];
|
||||
and index, 0x7ffffff ; clear index if size == 32
|
||||
sub size, 16 ; size -= 16;
|
||||
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
|
||||
jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
|
||||
movd mm_nbits, nbits ; nbits --> MMX register
|
||||
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
|
||||
por mm_put_buffer, mm_code ; put_buffer |= code;
|
||||
jmp .ERLOOP1 ; goto .ERLOOP1;
|
||||
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
align 16
|
||||
%ifdef PIC
|
||||
times 6 nop
|
||||
%else
|
||||
times 2 nop
|
||||
%endif
|
||||
.BLOOP1: ; do { /* size = # of zero bits/elements to skip */
|
||||
; If size == 32, index remains unchanged, because x86 masks the shift count to 5 bits. This is corrected in .BRLOOP1.
|
||||
shr index, sizeb ; index >>= size;
|
||||
lea t, [t + size * SIZEOF_WORD] ; t += size;
|
||||
cmp size, 16 ; if (size > 16)
|
||||
jg .BRLOOP1 ; goto .BRLOOP1;
|
||||
.ERLOOP1: ; .ERLOOP1:
|
||||
movsx nbits, word [t] ; nbits = *t;
|
||||
%ifdef PIC
|
||||
add size, size ; size += size;
|
||||
%else
|
||||
lea size, [size * 2] ; size += size;
|
||||
%endif
|
||||
movd mm_temp, nbits ; temp = nbits;
|
||||
movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
|
||||
lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
|
||||
movd mm_nbits, nbits ; nbits --> MMX register
|
||||
movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
|
||||
; code = actbl->ehufco[size-16];
|
||||
movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
|
||||
; size = actbl->ehufsi[size-16];
|
||||
.BEGIN: ; .BEGIN:
|
||||
pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
|
||||
psllq mm_code, mm_nbits ; code <<= nbits;
|
||||
add nbits, size ; nbits += size;
|
||||
por mm_code, mm_temp ; code |= temp;
|
||||
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
|
||||
jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
|
||||
xor size, size ; size = 0; /* kill tzcnt input dependency */
|
||||
tzcnt size, index ; size = # of trailing 0 bits in index
|
||||
movd mm_nbits, nbits ; nbits --> MMX register
|
||||
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
|
||||
inc size ; ++size;
|
||||
por mm_put_buffer, mm_code ; put_buffer |= code;
|
||||
test index, index
|
||||
jnz .BLOOP1 ; } while (index != 0);
|
||||
; Round 2
|
||||
; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
|
||||
.ELOOP1: ; .ELOOP1:
|
||||
pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
|
||||
pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
|
||||
shl size, 16 ; size <<= 16;
|
||||
or index, size ; index |= size;
|
||||
not index ; index = ~index;
|
||||
lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
|
||||
; nbits = t + 1 + 64;
|
||||
and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
|
||||
sub nbits, t ; nbits -= t;
|
||||
shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
|
||||
tzcnt size, index ; size = # of trailing 0 bits in index
|
||||
inc size ; ++size;
|
||||
test index, index ; if (index == 0)
|
||||
jz .ELOOP2 ; goto .ELOOP2;
|
||||
; NOTE: size == 32 cannot happen, since the last element is always 0.
|
||||
shr index, sizeb ; index >>= size;
|
||||
lea size, [size + nbits - 33] ; size = size + nbits - 33;
|
||||
lea t, [t + size * SIZEOF_WORD] ; t += size;
|
||||
cmp size, 16 ; if (size <= 16)
|
||||
jle .ERLOOP2 ; goto .ERLOOP2;
|
||||
.BRLOOP2: ; do {
|
||||
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
|
||||
; nbits = actbl->ehufsi[0xf0];
|
||||
sub size, 16 ; size -= 16;
|
||||
movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
|
||||
; code = actbl->ehufco[0xf0];
|
||||
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
|
||||
jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
|
||||
movd mm_nbits, nbits ; else { nbits --> MMX register
|
||||
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
|
||||
por mm_put_buffer, mm_code ; put_buffer |= code;
|
||||
cmp size, 16 ; if (size <= 16)
|
||||
jle .ERLOOP2 ; goto .ERLOOP2;
|
||||
jmp .BRLOOP2 ; } while (1);
|
||||
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
align 16
|
||||
.BLOOP2: ; do { /* size = # of zero bits/elements to skip */
|
||||
shr index, sizeb ; index >>= size;
|
||||
lea t, [t + size * SIZEOF_WORD] ; t += size;
|
||||
cmp size, 16 ; if (size > 16)
|
||||
jg .BRLOOP2 ; goto .BRLOOP2;
|
||||
.ERLOOP2: ; .ERLOOP2:
|
||||
movsx nbits, word [t] ; nbits = *t;
|
||||
add size, size ; size += size;
|
||||
movd mm_temp, nbits ; temp = nbits;
|
||||
movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
|
||||
movd mm_nbits, nbits ; nbits --> MMX register
|
||||
lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
|
||||
movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
|
||||
; code = actbl->ehufco[size-16];
|
||||
movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
|
||||
; size = actbl->ehufsi[size-16];
|
||||
psllq mm_code, mm_nbits ; code <<= nbits;
|
||||
pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
|
||||
lea nbits, [nbits + size] ; nbits += size;
|
||||
por mm_code, mm_temp ; code |= temp;
|
||||
xor size, size ; size = 0; /* kill tzcnt input dependency */
|
||||
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
|
||||
jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
|
||||
tzcnt size, index ; size = # of trailing 0 bits in index
|
||||
movd mm_nbits, nbits ; nbits --> MMX register
|
||||
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
|
||||
inc size ; ++size;
|
||||
por mm_put_buffer, mm_code ; put_buffer |= code;
|
||||
test index, index
|
||||
jnz .BLOOP2 ; } while (index != 0);
|
||||
.ELOOP2: ; .ELOOP2:
|
||||
mov nbits, t ; nbits = t;
|
||||
lea t, [t + SIZEOF_WORD] ; t = &t[1];
|
||||
and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
|
||||
and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
|
||||
cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
|
||||
je .EFN ; {
|
||||
movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
|
||||
; code = actbl->ehufco[0];
|
||||
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
|
||||
; nbits = actbl->ehufsi[0];
|
||||
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
|
||||
jg .EFN_SKIP_EMIT_CODE ; {
|
||||
EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
|
||||
align 16
|
||||
.EFN_SKIP_EMIT_CODE: ; } else {
|
||||
movd mm_nbits, nbits ; nbits --> MMX register
|
||||
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
|
||||
por mm_put_buffer, mm_code ; put_buffer |= code;
|
||||
; Function exit.  t (esp) equals &t_[0] here, so the frame pointer saved at
; function entry can be reloaded from t + save_frame.  The bit buffer and
; free_bits are written back to *state and the saved registers are restored.
; eax (buffer) is not modified past this point -- presumably it is the
; JOCTET * return value named in the prototype comment; confirm against the
; caller.
.EFN: ; } }
|
||||
%define frame esp
|
||||
mov frame, [t + save_frame] ; esp = frame pointer saved at entry
|
||||
%define state ecx
|
||||
mov state, [frame + arg_state] ; state = function argument state
|
||||
movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
|
||||
; state->cur.put_buffer.simd = put_buffer;
|
||||
emms ; done with the MMX registers
|
||||
mov [state + working_state.cur.free_bits], free_bits
|
||||
; state->cur.free_bits = free_bits;
|
||||
POP edi
|
||||
POP esi
|
||||
POP ebp
|
||||
POP ebx
|
||||
ret
|
||||
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
; Out-of-line flush paths.  Each one is entered when subtracting nbits from
; free_bits left free_bits <= 0.  EMIT_QWORD always ends in a jump, so
; execution never falls through from one stub into the next.

; Flush while emitting a run-length code in round 1, then resume at .ERLOOP1.
    align       16
.EMIT_BRLOOP1:
    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , .ERLOOP1

; Flush after a coefficient code in round 1.  The extra instructions repeat
; the end of the main loop: compute the next skip count and either continue
; at .BLOOP1 or, once index is 0, go on to .ELOOP1.
    align       16
.EMIT_ERLOOP1:
    EMIT_QWORD  size, sizeb, sizeh, \
                { xor size, size }, { tzcnt size, index }, { inc size }, \
                { test index, index }, { jnz .BLOOP1 }, \
                .ELOOP1

; Flush while emitting a run-length code in round 2.  Continues at .ERLOOP2
; once size <= 16, otherwise at .BRLOOP2.
    align       16
.EMIT_BRLOOP2:
    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
                { cmp size, 16 }, { jle .ERLOOP2 }, \
                .BRLOOP2

; Flush after a coefficient code in round 2; same shape as .EMIT_ERLOOP1.
    align       16
.EMIT_ERLOOP2:
    EMIT_QWORD  size, sizeb, sizeh, \
                { xor size, size }, { tzcnt size, index }, { inc size }, \
                { test index, index }, { jnz .BLOOP2 }, \
                .ELOOP2

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
|
||||
Reference in New Issue
Block a user