mirror of
https://github.com/Cateners/tiny_computer.git
synced 2026-05-20 16:35:47 +08:00
Update code to v1.0.14 (10)
This commit is contained in:
148
android/extern/libjpeg-turbo/simd/arm/aarch32/jccolext-neon.c
vendored
Normal file
148
android/extern/libjpeg-turbo/simd/arm/aarch32/jccolext-neon.c
vendored
Normal file
@@ -0,0 +1,148 @@
|
||||
/*
|
||||
* jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jccolor-neon.c */
|
||||
|
||||
|
||||
/* RGB -> YCbCr conversion is defined by the following equations:
|
||||
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
|
||||
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
|
||||
*
|
||||
* Avoid floating point arithmetic by using shifted integer constants:
|
||||
* 0.29899597 = 19595 * 2^-16
|
||||
* 0.58700561 = 38470 * 2^-16
|
||||
* 0.11399841 = 7471 * 2^-16
|
||||
* 0.16874695 = 11059 * 2^-16
|
||||
* 0.33125305 = 21709 * 2^-16
|
||||
* 0.50000000 = 32768 * 2^-16
|
||||
* 0.41868592 = 27439 * 2^-16
|
||||
* 0.08131409 = 5329 * 2^-16
|
||||
* These constants are defined in jccolor-neon.c
|
||||
*
|
||||
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
|
||||
* rounds up or down the result via integer truncation.
|
||||
*/
|
||||
|
||||
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
|
||||
JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
int num_rows)
|
||||
{
|
||||
/* Pointer to RGB(X/A) input data */
|
||||
JSAMPROW inptr;
|
||||
/* Pointers to Y, Cb, and Cr output data */
|
||||
JSAMPROW outptr0, outptr1, outptr2;
|
||||
/* Allocate temporary buffer for final (image_width % 8) pixels in row. */
|
||||
ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
|
||||
|
||||
/* Set up conversion constants. */
|
||||
#ifdef HAVE_VLD1_U16_X2
|
||||
const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
|
||||
#else
|
||||
/* GCC does not currently support the intrinsic vld1_<type>_x2(). */
|
||||
const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
|
||||
const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
|
||||
const uint16x4x2_t consts = { { consts1, consts2 } };
|
||||
#endif
|
||||
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr = *input_buf++;
|
||||
outptr0 = output_buf[0][output_row];
|
||||
outptr1 = output_buf[1][output_row];
|
||||
outptr2 = output_buf[2][output_row];
|
||||
output_row++;
|
||||
|
||||
int cols_remaining = image_width;
|
||||
for (; cols_remaining > 0; cols_remaining -= 8) {
|
||||
|
||||
/* To prevent buffer overread by the vector load instructions, the last
|
||||
* (image_width % 8) columns of data are first memcopied to a temporary
|
||||
* buffer large enough to accommodate the vector load.
|
||||
*/
|
||||
if (cols_remaining < 8) {
|
||||
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
|
||||
inptr = tmp_buf;
|
||||
}
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x8x4_t input_pixels = vld4_u8(inptr);
|
||||
#else
|
||||
uint8x8x3_t input_pixels = vld3_u8(inptr);
|
||||
#endif
|
||||
uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
|
||||
uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
|
||||
uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
|
||||
|
||||
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
|
||||
uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
|
||||
y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
|
||||
y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
|
||||
uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
|
||||
y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
|
||||
y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
|
||||
|
||||
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
|
||||
uint32x4_t cb_low = scaled_128_5;
|
||||
cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
|
||||
cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
|
||||
cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
|
||||
uint32x4_t cb_high = scaled_128_5;
|
||||
cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
|
||||
cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
|
||||
cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
|
||||
|
||||
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
|
||||
uint32x4_t cr_low = scaled_128_5;
|
||||
cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
|
||||
cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
|
||||
cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
|
||||
uint32x4_t cr_high = scaled_128_5;
|
||||
cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
|
||||
cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
|
||||
cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
|
||||
|
||||
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
|
||||
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
|
||||
vrshrn_n_u32(y_high, 16));
|
||||
/* Descale Cb values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
|
||||
vshrn_n_u32(cb_high, 16));
|
||||
/* Descale Cr values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
|
||||
vshrn_n_u32(cr_high, 16));
|
||||
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
|
||||
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
|
||||
*/
|
||||
vst1_u8(outptr0, vmovn_u16(y_u16));
|
||||
vst1_u8(outptr1, vmovn_u16(cb_u16));
|
||||
vst1_u8(outptr2, vmovn_u16(cr_u16));
|
||||
|
||||
/* Increment pointers. */
|
||||
inptr += (8 * RGB_PIXELSIZE);
|
||||
outptr0 += 8;
|
||||
outptr1 += 8;
|
||||
outptr2 += 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
334
android/extern/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c
vendored
Normal file
334
android/extern/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c
vendored
Normal file
@@ -0,0 +1,334 @@
|
||||
/*
|
||||
* jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*
|
||||
* NOTE: All referenced figures are from
|
||||
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../../jinclude.h"
|
||||
#include "../../../jpeglib.h"
|
||||
#include "../../../jsimd.h"
|
||||
#include "../../../jdct.h"
|
||||
#include "../../../jsimddct.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../jchuff.h"
|
||||
#include "neon-compat.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
|
||||
JCOEFPTR block, int last_dc_val,
|
||||
c_derived_tbl *dctbl,
|
||||
c_derived_tbl *actbl)
|
||||
{
|
||||
uint8_t block_nbits[DCTSIZE2];
|
||||
uint16_t block_diff[DCTSIZE2];
|
||||
|
||||
/* Load rows of coefficients from DCT block in zig-zag order. */
|
||||
|
||||
/* Compute DC coefficient difference value. (F.1.1.5.1) */
|
||||
int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
|
||||
row0 = vld1q_lane_s16(block + 1, row0, 1);
|
||||
row0 = vld1q_lane_s16(block + 8, row0, 2);
|
||||
row0 = vld1q_lane_s16(block + 16, row0, 3);
|
||||
row0 = vld1q_lane_s16(block + 9, row0, 4);
|
||||
row0 = vld1q_lane_s16(block + 2, row0, 5);
|
||||
row0 = vld1q_lane_s16(block + 3, row0, 6);
|
||||
row0 = vld1q_lane_s16(block + 10, row0, 7);
|
||||
|
||||
int16x8_t row1 = vld1q_dup_s16(block + 17);
|
||||
row1 = vld1q_lane_s16(block + 24, row1, 1);
|
||||
row1 = vld1q_lane_s16(block + 32, row1, 2);
|
||||
row1 = vld1q_lane_s16(block + 25, row1, 3);
|
||||
row1 = vld1q_lane_s16(block + 18, row1, 4);
|
||||
row1 = vld1q_lane_s16(block + 11, row1, 5);
|
||||
row1 = vld1q_lane_s16(block + 4, row1, 6);
|
||||
row1 = vld1q_lane_s16(block + 5, row1, 7);
|
||||
|
||||
int16x8_t row2 = vld1q_dup_s16(block + 12);
|
||||
row2 = vld1q_lane_s16(block + 19, row2, 1);
|
||||
row2 = vld1q_lane_s16(block + 26, row2, 2);
|
||||
row2 = vld1q_lane_s16(block + 33, row2, 3);
|
||||
row2 = vld1q_lane_s16(block + 40, row2, 4);
|
||||
row2 = vld1q_lane_s16(block + 48, row2, 5);
|
||||
row2 = vld1q_lane_s16(block + 41, row2, 6);
|
||||
row2 = vld1q_lane_s16(block + 34, row2, 7);
|
||||
|
||||
int16x8_t row3 = vld1q_dup_s16(block + 27);
|
||||
row3 = vld1q_lane_s16(block + 20, row3, 1);
|
||||
row3 = vld1q_lane_s16(block + 13, row3, 2);
|
||||
row3 = vld1q_lane_s16(block + 6, row3, 3);
|
||||
row3 = vld1q_lane_s16(block + 7, row3, 4);
|
||||
row3 = vld1q_lane_s16(block + 14, row3, 5);
|
||||
row3 = vld1q_lane_s16(block + 21, row3, 6);
|
||||
row3 = vld1q_lane_s16(block + 28, row3, 7);
|
||||
|
||||
int16x8_t abs_row0 = vabsq_s16(row0);
|
||||
int16x8_t abs_row1 = vabsq_s16(row1);
|
||||
int16x8_t abs_row2 = vabsq_s16(row2);
|
||||
int16x8_t abs_row3 = vabsq_s16(row3);
|
||||
|
||||
int16x8_t row0_lz = vclzq_s16(abs_row0);
|
||||
int16x8_t row1_lz = vclzq_s16(abs_row1);
|
||||
int16x8_t row2_lz = vclzq_s16(abs_row2);
|
||||
int16x8_t row3_lz = vclzq_s16(abs_row3);
|
||||
|
||||
/* Compute number of bits required to represent each coefficient. */
|
||||
uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
|
||||
uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
|
||||
uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
|
||||
uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
|
||||
|
||||
vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
|
||||
vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
|
||||
vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
|
||||
vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
|
||||
|
||||
uint16x8_t row0_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
|
||||
vnegq_s16(row0_lz));
|
||||
uint16x8_t row1_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
|
||||
vnegq_s16(row1_lz));
|
||||
uint16x8_t row2_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
|
||||
vnegq_s16(row2_lz));
|
||||
uint16x8_t row3_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
|
||||
vnegq_s16(row3_lz));
|
||||
|
||||
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
|
||||
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
|
||||
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
|
||||
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
|
||||
|
||||
/* Store diff values for rows 0, 1, 2, and 3. */
|
||||
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
|
||||
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
|
||||
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
|
||||
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
|
||||
|
||||
/* Load last four rows of coefficients from DCT block in zig-zag order. */
|
||||
int16x8_t row4 = vld1q_dup_s16(block + 35);
|
||||
row4 = vld1q_lane_s16(block + 42, row4, 1);
|
||||
row4 = vld1q_lane_s16(block + 49, row4, 2);
|
||||
row4 = vld1q_lane_s16(block + 56, row4, 3);
|
||||
row4 = vld1q_lane_s16(block + 57, row4, 4);
|
||||
row4 = vld1q_lane_s16(block + 50, row4, 5);
|
||||
row4 = vld1q_lane_s16(block + 43, row4, 6);
|
||||
row4 = vld1q_lane_s16(block + 36, row4, 7);
|
||||
|
||||
int16x8_t row5 = vld1q_dup_s16(block + 29);
|
||||
row5 = vld1q_lane_s16(block + 22, row5, 1);
|
||||
row5 = vld1q_lane_s16(block + 15, row5, 2);
|
||||
row5 = vld1q_lane_s16(block + 23, row5, 3);
|
||||
row5 = vld1q_lane_s16(block + 30, row5, 4);
|
||||
row5 = vld1q_lane_s16(block + 37, row5, 5);
|
||||
row5 = vld1q_lane_s16(block + 44, row5, 6);
|
||||
row5 = vld1q_lane_s16(block + 51, row5, 7);
|
||||
|
||||
int16x8_t row6 = vld1q_dup_s16(block + 58);
|
||||
row6 = vld1q_lane_s16(block + 59, row6, 1);
|
||||
row6 = vld1q_lane_s16(block + 52, row6, 2);
|
||||
row6 = vld1q_lane_s16(block + 45, row6, 3);
|
||||
row6 = vld1q_lane_s16(block + 38, row6, 4);
|
||||
row6 = vld1q_lane_s16(block + 31, row6, 5);
|
||||
row6 = vld1q_lane_s16(block + 39, row6, 6);
|
||||
row6 = vld1q_lane_s16(block + 46, row6, 7);
|
||||
|
||||
int16x8_t row7 = vld1q_dup_s16(block + 53);
|
||||
row7 = vld1q_lane_s16(block + 60, row7, 1);
|
||||
row7 = vld1q_lane_s16(block + 61, row7, 2);
|
||||
row7 = vld1q_lane_s16(block + 54, row7, 3);
|
||||
row7 = vld1q_lane_s16(block + 47, row7, 4);
|
||||
row7 = vld1q_lane_s16(block + 55, row7, 5);
|
||||
row7 = vld1q_lane_s16(block + 62, row7, 6);
|
||||
row7 = vld1q_lane_s16(block + 63, row7, 7);
|
||||
|
||||
int16x8_t abs_row4 = vabsq_s16(row4);
|
||||
int16x8_t abs_row5 = vabsq_s16(row5);
|
||||
int16x8_t abs_row6 = vabsq_s16(row6);
|
||||
int16x8_t abs_row7 = vabsq_s16(row7);
|
||||
|
||||
int16x8_t row4_lz = vclzq_s16(abs_row4);
|
||||
int16x8_t row5_lz = vclzq_s16(abs_row5);
|
||||
int16x8_t row6_lz = vclzq_s16(abs_row6);
|
||||
int16x8_t row7_lz = vclzq_s16(abs_row7);
|
||||
|
||||
/* Compute number of bits required to represent each coefficient. */
|
||||
uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
|
||||
uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
|
||||
uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
|
||||
uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
|
||||
vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
|
||||
|
||||
vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
|
||||
vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
|
||||
vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
|
||||
vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
|
||||
|
||||
uint16x8_t row4_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
|
||||
vnegq_s16(row4_lz));
|
||||
uint16x8_t row5_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
|
||||
vnegq_s16(row5_lz));
|
||||
uint16x8_t row6_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
|
||||
vnegq_s16(row6_lz));
|
||||
uint16x8_t row7_mask =
|
||||
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
|
||||
vnegq_s16(row7_lz));
|
||||
|
||||
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
|
||||
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
|
||||
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
|
||||
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
|
||||
|
||||
/* Store diff values for rows 4, 5, 6, and 7. */
|
||||
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
|
||||
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
|
||||
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
|
||||
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
|
||||
|
||||
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
|
||||
* means that the corresponding coefficient != 0.
|
||||
*/
|
||||
uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
|
||||
uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
|
||||
uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
|
||||
uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
|
||||
uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
|
||||
uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
|
||||
uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
|
||||
uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
|
||||
|
||||
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
|
||||
const uint8x8_t bitmap_mask =
|
||||
vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
|
||||
|
||||
row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
|
||||
row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
|
||||
row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
|
||||
row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
|
||||
row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
|
||||
row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
|
||||
row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
|
||||
row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
|
||||
|
||||
uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
|
||||
uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
|
||||
uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
|
||||
uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
|
||||
uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
|
||||
uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
|
||||
uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
|
||||
|
||||
/* Shift left to remove DC bit. */
|
||||
bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
|
||||
/* Move bitmap to 32-bit scalar registers. */
|
||||
uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
|
||||
uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
|
||||
|
||||
/* Set up state and bit buffer for output bitstream. */
|
||||
working_state *state_ptr = (working_state *)state;
|
||||
int free_bits = state_ptr->cur.free_bits;
|
||||
size_t put_buffer = state_ptr->cur.put_buffer;
|
||||
|
||||
/* Encode DC coefficient. */
|
||||
|
||||
unsigned int nbits = block_nbits[0];
|
||||
/* Emit Huffman-coded symbol and additional diff bits. */
|
||||
unsigned int diff = block_diff[0];
|
||||
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
|
||||
|
||||
/* Encode AC coefficients. */
|
||||
|
||||
unsigned int r = 0; /* r = run length of zeros */
|
||||
unsigned int i = 1; /* i = number of coefficients encoded */
|
||||
/* Code and size information for a run length of 16 zero coefficients */
|
||||
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
|
||||
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
|
||||
|
||||
while (bitmap_1_32 != 0) {
|
||||
r = BUILTIN_CLZ(bitmap_1_32);
|
||||
i += r;
|
||||
bitmap_1_32 <<= r;
|
||||
nbits = block_nbits[i];
|
||||
diff = block_diff[i];
|
||||
while (r > 15) {
|
||||
/* If run length > 15, emit special run-length-16 codes. */
|
||||
PUT_BITS(code_0xf0, size_0xf0)
|
||||
r -= 16;
|
||||
}
|
||||
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
|
||||
unsigned int rs = (r << 4) + nbits;
|
||||
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
|
||||
i++;
|
||||
bitmap_1_32 <<= 1;
|
||||
}
|
||||
|
||||
r = 33 - i;
|
||||
i = 33;
|
||||
|
||||
while (bitmap_33_63 != 0) {
|
||||
unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
|
||||
r += leading_zeros;
|
||||
i += leading_zeros;
|
||||
bitmap_33_63 <<= leading_zeros;
|
||||
nbits = block_nbits[i];
|
||||
diff = block_diff[i];
|
||||
while (r > 15) {
|
||||
/* If run length > 15, emit special run-length-16 codes. */
|
||||
PUT_BITS(code_0xf0, size_0xf0)
|
||||
r -= 16;
|
||||
}
|
||||
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
|
||||
unsigned int rs = (r << 4) + nbits;
|
||||
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
|
||||
r = 0;
|
||||
i++;
|
||||
bitmap_33_63 <<= 1;
|
||||
}
|
||||
|
||||
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
|
||||
* The value of RS for the EOB code is 0.
|
||||
*/
|
||||
if (i != 64) {
|
||||
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
|
||||
}
|
||||
|
||||
state_ptr->cur.put_buffer = put_buffer;
|
||||
state_ptr->cur.free_bits = free_bits;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
976
android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd.c
vendored
Normal file
976
android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd.c
vendored
Normal file
@@ -0,0 +1,976 @@
|
||||
/*
|
||||
* jsimd_arm.c
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
|
||||
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
|
||||
* Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
|
||||
* Copyright (C) 2019, Google LLC.
|
||||
* Copyright (C) 2020, Arm Limited.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
* For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
*
|
||||
* This file contains the interface between the "normal" portions
|
||||
* of the library and the SIMD implementations when running on a
|
||||
* 32-bit Arm architecture.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../../jinclude.h"
|
||||
#include "../../../jpeglib.h"
|
||||
#include "../../../jsimd.h"
|
||||
#include "../../../jdct.h"
|
||||
#include "../../../jsimddct.h"
|
||||
#include "../../jsimd.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
static THREAD_LOCAL unsigned int simd_support = ~0;
|
||||
static THREAD_LOCAL unsigned int simd_huffman = 1;
|
||||
|
||||
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
|
||||
|
||||
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
|
||||
|
||||
LOCAL(int)
|
||||
check_feature(char *buffer, char *feature)
|
||||
{
|
||||
char *p;
|
||||
|
||||
if (*feature == 0)
|
||||
return 0;
|
||||
if (strncmp(buffer, "Features", 8) != 0)
|
||||
return 0;
|
||||
buffer += 8;
|
||||
while (isspace(*buffer))
|
||||
buffer++;
|
||||
|
||||
/* Check if 'feature' is present in the buffer as a separate word */
|
||||
while ((p = strstr(buffer, feature))) {
|
||||
if (p > buffer && !isspace(*(p - 1))) {
|
||||
buffer++;
|
||||
continue;
|
||||
}
|
||||
p += strlen(feature);
|
||||
if (*p != 0 && !isspace(*p)) {
|
||||
buffer++;
|
||||
continue;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
LOCAL(int)
|
||||
parse_proc_cpuinfo(int bufsize)
|
||||
{
|
||||
char *buffer = (char *)malloc(bufsize);
|
||||
FILE *fd;
|
||||
|
||||
simd_support = 0;
|
||||
|
||||
if (!buffer)
|
||||
return 0;
|
||||
|
||||
fd = fopen("/proc/cpuinfo", "r");
|
||||
if (fd) {
|
||||
while (fgets(buffer, bufsize, fd)) {
|
||||
if (!strchr(buffer, '\n') && !feof(fd)) {
|
||||
/* "impossible" happened - insufficient size of the buffer! */
|
||||
fclose(fd);
|
||||
free(buffer);
|
||||
return 0;
|
||||
}
|
||||
if (check_feature(buffer, "neon"))
|
||||
simd_support |= JSIMD_NEON;
|
||||
}
|
||||
fclose(fd);
|
||||
}
|
||||
free(buffer);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Check what SIMD accelerations are supported.
|
||||
*/
|
||||
LOCAL(void)
|
||||
init_simd(void)
|
||||
{
|
||||
#ifndef NO_GETENV
|
||||
char env[2] = { 0 };
|
||||
#endif
|
||||
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
|
||||
int bufsize = 1024; /* an initial guess for the line buffer size limit */
|
||||
#endif
|
||||
|
||||
if (simd_support != ~0U)
|
||||
return;
|
||||
|
||||
simd_support = 0;
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
simd_support |= JSIMD_NEON;
|
||||
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
|
||||
/* We still have a chance to use Neon regardless of globally used
|
||||
* -mcpu/-mfpu options passed to gcc by performing runtime detection via
|
||||
* /proc/cpuinfo parsing on linux/android */
|
||||
while (!parse_proc_cpuinfo(bufsize)) {
|
||||
bufsize *= 2;
|
||||
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef NO_GETENV
|
||||
/* Force different settings through environment variables */
|
||||
if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
|
||||
simd_support = JSIMD_NEON;
|
||||
if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
|
||||
simd_support = 0;
|
||||
if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
|
||||
simd_huffman = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_ycc(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_gray(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_ycc_rgb(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_ycc_rgb565(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
|
||||
JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
int num_rows)
|
||||
{
|
||||
void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
|
||||
switch (cinfo->in_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
neonfct = jsimd_extrgb_ycc_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
neonfct = jsimd_extrgbx_ycc_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
neonfct = jsimd_extbgr_ycc_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
neonfct = jsimd_extbgrx_ycc_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
neonfct = jsimd_extxbgr_ycc_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
neonfct = jsimd_extxrgb_ycc_convert_neon;
|
||||
break;
|
||||
default:
|
||||
neonfct = jsimd_extrgb_ycc_convert_neon;
|
||||
break;
|
||||
}
|
||||
|
||||
neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
|
||||
JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
int num_rows)
|
||||
{
|
||||
void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
|
||||
switch (cinfo->in_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
neonfct = jsimd_extrgb_gray_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
neonfct = jsimd_extrgbx_gray_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
neonfct = jsimd_extbgr_gray_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
neonfct = jsimd_extbgrx_gray_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
neonfct = jsimd_extxbgr_gray_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
neonfct = jsimd_extxrgb_gray_convert_neon;
|
||||
break;
|
||||
default:
|
||||
neonfct = jsimd_extrgb_gray_convert_neon;
|
||||
break;
|
||||
}
|
||||
|
||||
neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||
JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
int num_rows)
|
||||
{
|
||||
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
|
||||
|
||||
switch (cinfo->out_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
neonfct = jsimd_ycc_extrgb_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
neonfct = jsimd_ycc_extrgbx_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
neonfct = jsimd_ycc_extbgr_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
neonfct = jsimd_ycc_extbgrx_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
neonfct = jsimd_ycc_extxbgr_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
neonfct = jsimd_ycc_extxrgb_convert_neon;
|
||||
break;
|
||||
default:
|
||||
neonfct = jsimd_ycc_extrgb_convert_neon;
|
||||
break;
|
||||
}
|
||||
|
||||
neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||
JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
int num_rows)
|
||||
{
|
||||
jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
|
||||
output_buf, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_downsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_downsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||
input_data, output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
|
||||
input_data, output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_fancy_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_fancy_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h1v2_fancy_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data,
|
||||
output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data,
|
||||
output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data,
|
||||
output_data_ptr);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_merged_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_merged_upsample(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
||||
{
|
||||
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||
|
||||
switch (cinfo->out_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
|
||||
break;
|
||||
default:
|
||||
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
|
||||
break;
|
||||
}
|
||||
|
||||
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
|
||||
{
|
||||
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||
|
||||
switch (cinfo->out_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
|
||||
break;
|
||||
default:
|
||||
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
|
||||
break;
|
||||
}
|
||||
|
||||
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_convsamp(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_convsamp_float(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
DCTELEM *workspace)
|
||||
{
|
||||
jsimd_convsamp_neon(sample_data, start_col, workspace);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
FAST_FLOAT *workspace)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_fdct_islow(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_fdct_ifast(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_fdct_float(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_islow(DCTELEM *data)
|
||||
{
|
||||
jsimd_fdct_islow_neon(data);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_ifast(DCTELEM *data)
|
||||
{
|
||||
jsimd_fdct_ifast_neon(data);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_float(FAST_FLOAT *data)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_quantize(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_quantize_float(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
|
||||
{
|
||||
jsimd_quantize_neon(coef_block, divisors, workspace);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
|
||||
FAST_FLOAT *workspace)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_2x2(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(ISLOW_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_4x4(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(ISLOW_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_islow(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(ISLOW_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_ifast(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(IFAST_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
if (IFAST_SCALE_BITS != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_float(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
|
||||
output_col);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
|
||||
output_col);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_huff_encode_one_block(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON && simd_huffman)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(JOCTET *)
|
||||
jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
int last_dc_val, c_derived_tbl *dctbl,
|
||||
c_derived_tbl *actbl)
|
||||
{
|
||||
return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
|
||||
dctbl, actbl);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_first_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, UJCOEF *values, size_t *zerobits)
|
||||
{
|
||||
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
|
||||
Sl, Al, values, zerobits);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, UJCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
return jsimd_encode_mcu_AC_refine_prepare_neon(block,
|
||||
jpeg_natural_order_start, Sl,
|
||||
Al, absvalues, bits);
|
||||
}
|
||||
1200
android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd_neon.S
vendored
Normal file
1200
android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd_neon.S
vendored
Normal file
File diff suppressed because it is too large
Load Diff
316
android/extern/libjpeg-turbo/simd/arm/aarch64/jccolext-neon.c
vendored
Normal file
316
android/extern/libjpeg-turbo/simd/arm/aarch64/jccolext-neon.c
vendored
Normal file
@@ -0,0 +1,316 @@
|
||||
/*
|
||||
* jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jccolor-neon.c */
|
||||
|
||||
|
||||
/* RGB -> YCbCr conversion is defined by the following equations:
|
||||
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
|
||||
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
|
||||
*
|
||||
* Avoid floating point arithmetic by using shifted integer constants:
|
||||
* 0.29899597 = 19595 * 2^-16
|
||||
* 0.58700561 = 38470 * 2^-16
|
||||
* 0.11399841 = 7471 * 2^-16
|
||||
* 0.16874695 = 11059 * 2^-16
|
||||
* 0.33125305 = 21709 * 2^-16
|
||||
* 0.50000000 = 32768 * 2^-16
|
||||
* 0.41868592 = 27439 * 2^-16
|
||||
* 0.08131409 = 5329 * 2^-16
|
||||
* These constants are defined in jccolor-neon.c
|
||||
*
|
||||
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
|
||||
* rounds up or down the result via integer truncation.
|
||||
*/
|
||||
|
||||
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
|
||||
JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
int num_rows)
|
||||
{
|
||||
/* Pointer to RGB(X/A) input data */
|
||||
JSAMPROW inptr;
|
||||
/* Pointers to Y, Cb, and Cr output data */
|
||||
JSAMPROW outptr0, outptr1, outptr2;
|
||||
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
|
||||
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
|
||||
|
||||
/* Set up conversion constants. */
|
||||
const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
|
||||
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr = *input_buf++;
|
||||
outptr0 = output_buf[0][output_row];
|
||||
outptr1 = output_buf[1][output_row];
|
||||
outptr2 = output_buf[2][output_row];
|
||||
output_row++;
|
||||
|
||||
int cols_remaining = image_width;
|
||||
for (; cols_remaining >= 16; cols_remaining -= 16) {
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x16x4_t input_pixels = vld4q_u8(inptr);
|
||||
#else
|
||||
uint8x16x3_t input_pixels = vld3q_u8(inptr);
|
||||
#endif
|
||||
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
|
||||
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
|
||||
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
|
||||
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
|
||||
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
|
||||
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
|
||||
|
||||
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
|
||||
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
|
||||
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
|
||||
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
|
||||
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
|
||||
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
|
||||
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
|
||||
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
|
||||
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
|
||||
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
|
||||
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
|
||||
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
|
||||
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
|
||||
|
||||
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
|
||||
uint32x4_t cb_ll = scaled_128_5;
|
||||
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
|
||||
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
|
||||
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
|
||||
uint32x4_t cb_lh = scaled_128_5;
|
||||
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
|
||||
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
|
||||
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
|
||||
uint32x4_t cb_hl = scaled_128_5;
|
||||
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
|
||||
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
|
||||
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
|
||||
uint32x4_t cb_hh = scaled_128_5;
|
||||
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
|
||||
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
|
||||
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
|
||||
|
||||
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
|
||||
uint32x4_t cr_ll = scaled_128_5;
|
||||
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
|
||||
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
|
||||
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
|
||||
uint32x4_t cr_lh = scaled_128_5;
|
||||
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
|
||||
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
|
||||
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
|
||||
uint32x4_t cr_hl = scaled_128_5;
|
||||
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
|
||||
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
|
||||
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
|
||||
uint32x4_t cr_hh = scaled_128_5;
|
||||
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
|
||||
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
|
||||
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
|
||||
|
||||
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
|
||||
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
|
||||
vrshrn_n_u32(y_lh, 16));
|
||||
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
|
||||
vrshrn_n_u32(y_hh, 16));
|
||||
/* Descale Cb values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
|
||||
vshrn_n_u32(cb_lh, 16));
|
||||
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
|
||||
vshrn_n_u32(cb_hh, 16));
|
||||
/* Descale Cr values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
|
||||
vshrn_n_u32(cr_lh, 16));
|
||||
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
|
||||
vshrn_n_u32(cr_hh, 16));
|
||||
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
|
||||
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
|
||||
*/
|
||||
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
|
||||
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
|
||||
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
|
||||
|
||||
/* Increment pointers. */
|
||||
inptr += (16 * RGB_PIXELSIZE);
|
||||
outptr0 += 16;
|
||||
outptr1 += 16;
|
||||
outptr2 += 16;
|
||||
}
|
||||
|
||||
if (cols_remaining > 8) {
|
||||
/* To prevent buffer overread by the vector load instructions, the last
|
||||
* (image_width % 16) columns of data are first memcopied to a temporary
|
||||
* buffer large enough to accommodate the vector load.
|
||||
*/
|
||||
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
|
||||
inptr = tmp_buf;
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x16x4_t input_pixels = vld4q_u8(inptr);
|
||||
#else
|
||||
uint8x16x3_t input_pixels = vld3q_u8(inptr);
|
||||
#endif
|
||||
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
|
||||
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
|
||||
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
|
||||
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
|
||||
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
|
||||
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
|
||||
|
||||
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
|
||||
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
|
||||
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
|
||||
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
|
||||
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
|
||||
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
|
||||
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
|
||||
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
|
||||
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
|
||||
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
|
||||
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
|
||||
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
|
||||
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
|
||||
|
||||
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
|
||||
uint32x4_t cb_ll = scaled_128_5;
|
||||
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
|
||||
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
|
||||
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
|
||||
uint32x4_t cb_lh = scaled_128_5;
|
||||
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
|
||||
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
|
||||
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
|
||||
uint32x4_t cb_hl = scaled_128_5;
|
||||
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
|
||||
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
|
||||
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
|
||||
uint32x4_t cb_hh = scaled_128_5;
|
||||
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
|
||||
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
|
||||
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
|
||||
|
||||
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
|
||||
uint32x4_t cr_ll = scaled_128_5;
|
||||
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
|
||||
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
|
||||
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
|
||||
uint32x4_t cr_lh = scaled_128_5;
|
||||
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
|
||||
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
|
||||
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
|
||||
uint32x4_t cr_hl = scaled_128_5;
|
||||
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
|
||||
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
|
||||
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
|
||||
uint32x4_t cr_hh = scaled_128_5;
|
||||
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
|
||||
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
|
||||
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
|
||||
|
||||
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
|
||||
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
|
||||
vrshrn_n_u32(y_lh, 16));
|
||||
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
|
||||
vrshrn_n_u32(y_hh, 16));
|
||||
/* Descale Cb values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
|
||||
vshrn_n_u32(cb_lh, 16));
|
||||
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
|
||||
vshrn_n_u32(cb_hh, 16));
|
||||
/* Descale Cr values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
|
||||
vshrn_n_u32(cr_lh, 16));
|
||||
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
|
||||
vshrn_n_u32(cr_hh, 16));
|
||||
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
|
||||
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
|
||||
*/
|
||||
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
|
||||
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
|
||||
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
|
||||
|
||||
} else if (cols_remaining > 0) {
|
||||
/* To prevent buffer overread by the vector load instructions, the last
|
||||
* (image_width % 8) columns of data are first memcopied to a temporary
|
||||
* buffer large enough to accommodate the vector load.
|
||||
*/
|
||||
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
|
||||
inptr = tmp_buf;
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x8x4_t input_pixels = vld4_u8(inptr);
|
||||
#else
|
||||
uint8x8x3_t input_pixels = vld3_u8(inptr);
|
||||
#endif
|
||||
uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
|
||||
uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
|
||||
uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
|
||||
|
||||
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
|
||||
uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
|
||||
y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
|
||||
y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
|
||||
uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
|
||||
y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
|
||||
y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
|
||||
|
||||
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
|
||||
uint32x4_t cb_l = scaled_128_5;
|
||||
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
|
||||
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
|
||||
cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
|
||||
uint32x4_t cb_h = scaled_128_5;
|
||||
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
|
||||
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
|
||||
cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
|
||||
|
||||
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
|
||||
uint32x4_t cr_l = scaled_128_5;
|
||||
cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
|
||||
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
|
||||
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
|
||||
uint32x4_t cr_h = scaled_128_5;
|
||||
cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
|
||||
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
|
||||
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
|
||||
|
||||
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
|
||||
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
|
||||
vrshrn_n_u32(y_h, 16));
|
||||
/* Descale Cb values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
|
||||
vshrn_n_u32(cb_h, 16));
|
||||
/* Descale Cr values (right shift) and narrow to 16-bit. */
|
||||
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
|
||||
vshrn_n_u32(cr_h, 16));
|
||||
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
|
||||
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
|
||||
*/
|
||||
vst1_u8(outptr0, vmovn_u16(y_u16));
|
||||
vst1_u8(outptr1, vmovn_u16(cb_u16));
|
||||
vst1_u8(outptr2, vmovn_u16(cr_u16));
|
||||
}
|
||||
}
|
||||
}
|
||||
411
android/extern/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c
vendored
Normal file
411
android/extern/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c
vendored
Normal file
@@ -0,0 +1,411 @@
|
||||
/*
|
||||
* jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*
|
||||
* NOTE: All referenced figures are from
|
||||
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../../jinclude.h"
|
||||
#include "../../../jpeglib.h"
|
||||
#include "../../../jsimd.h"
|
||||
#include "../../../jdct.h"
|
||||
#include "../../../jsimddct.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../align.h"
|
||||
#include "../jchuff.h"
|
||||
#include "neon-compat.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
|
||||
0, 1, 2, 3, 16, 17, 32, 33,
|
||||
18, 19, 4, 5, 6, 7, 20, 21,
|
||||
34, 35, 48, 49, 255, 255, 50, 51,
|
||||
36, 37, 22, 23, 8, 9, 10, 11,
|
||||
255, 255, 6, 7, 20, 21, 34, 35,
|
||||
48, 49, 255, 255, 50, 51, 36, 37,
|
||||
54, 55, 40, 41, 26, 27, 12, 13,
|
||||
14, 15, 28, 29, 42, 43, 56, 57,
|
||||
6, 7, 20, 21, 34, 35, 48, 49,
|
||||
50, 51, 36, 37, 22, 23, 8, 9,
|
||||
26, 27, 12, 13, 255, 255, 14, 15,
|
||||
28, 29, 42, 43, 56, 57, 255, 255,
|
||||
52, 53, 54, 55, 40, 41, 26, 27,
|
||||
12, 13, 255, 255, 14, 15, 28, 29,
|
||||
26, 27, 40, 41, 42, 43, 28, 29,
|
||||
14, 15, 30, 31, 44, 45, 46, 47
|
||||
};
|
||||
|
||||
/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
|
||||
* address warning because the macro sometimes writes a 64-bit value to a
|
||||
* non-64-bit-aligned address. That behavior is technically undefined per
|
||||
* the C specification, but it is supported by the AArch64 architecture and
|
||||
* compilers.
|
||||
*/
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(undefined_behavior_sanitizer)
|
||||
__attribute__((no_sanitize("alignment")))
|
||||
#endif
|
||||
#endif
|
||||
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
|
||||
JCOEFPTR block, int last_dc_val,
|
||||
c_derived_tbl *dctbl,
|
||||
c_derived_tbl *actbl)
|
||||
{
|
||||
uint16_t block_diff[DCTSIZE2];
|
||||
|
||||
/* Load lookup table indices for rows of zig-zag ordering. */
|
||||
#ifdef HAVE_VLD1Q_U8_X4
|
||||
const uint8x16x4_t idx_rows_0123 =
|
||||
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
|
||||
const uint8x16x4_t idx_rows_4567 =
|
||||
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
|
||||
#else
|
||||
/* GCC does not currently support intrinsics vl1dq_<type>_x4(). */
|
||||
const uint8x16x4_t idx_rows_0123 = { {
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
|
||||
} };
|
||||
const uint8x16x4_t idx_rows_4567 = { {
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
|
||||
vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
|
||||
} };
|
||||
#endif
|
||||
|
||||
/* Load 8x8 block of DCT coefficients. */
|
||||
#ifdef HAVE_VLD1Q_U8_X4
|
||||
const int8x16x4_t tbl_rows_0123 =
|
||||
vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
|
||||
const int8x16x4_t tbl_rows_4567 =
|
||||
vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
|
||||
#else
|
||||
const int8x16x4_t tbl_rows_0123 = { {
|
||||
vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
|
||||
vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
|
||||
vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
|
||||
vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
|
||||
} };
|
||||
const int8x16x4_t tbl_rows_4567 = { {
|
||||
vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
|
||||
vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
|
||||
vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
|
||||
vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
|
||||
} };
|
||||
#endif
|
||||
|
||||
/* Initialise extra lookup tables. */
|
||||
const int8x16x4_t tbl_rows_2345 = { {
|
||||
tbl_rows_0123.val[2], tbl_rows_0123.val[3],
|
||||
tbl_rows_4567.val[0], tbl_rows_4567.val[1]
|
||||
} };
|
||||
const int8x16x3_t tbl_rows_567 =
|
||||
{ { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
|
||||
|
||||
/* Shuffle coefficients into zig-zag order. */
|
||||
int16x8_t row0 =
|
||||
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
|
||||
int16x8_t row1 =
|
||||
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
|
||||
int16x8_t row2 =
|
||||
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
|
||||
int16x8_t row3 =
|
||||
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
|
||||
int16x8_t row4 =
|
||||
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
|
||||
int16x8_t row5 =
|
||||
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
|
||||
int16x8_t row6 =
|
||||
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
|
||||
int16x8_t row7 =
|
||||
vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
|
||||
|
||||
/* Compute DC coefficient difference value (F.1.1.5.1). */
|
||||
row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
|
||||
/* Initialize AC coefficient lanes not reachable by lookup tables. */
|
||||
row1 =
|
||||
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
|
||||
0), row1, 2);
|
||||
row2 =
|
||||
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
|
||||
4), row2, 0);
|
||||
row2 =
|
||||
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
|
||||
0), row2, 5);
|
||||
row5 =
|
||||
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
|
||||
7), row5, 2);
|
||||
row5 =
|
||||
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
|
||||
3), row5, 7);
|
||||
row6 =
|
||||
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
|
||||
7), row6, 5);
|
||||
|
||||
/* DCT block is now in zig-zag order; start Huffman encoding process. */
|
||||
|
||||
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
|
||||
* means that the corresponding coefficient != 0.
|
||||
*/
|
||||
uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
|
||||
uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
|
||||
uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
|
||||
uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
|
||||
uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
|
||||
uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
|
||||
uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
|
||||
uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
|
||||
|
||||
uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
|
||||
vreinterpretq_u8_u16(row0_ne_0));
|
||||
uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
|
||||
vreinterpretq_u8_u16(row2_ne_0));
|
||||
uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
|
||||
vreinterpretq_u8_u16(row4_ne_0));
|
||||
uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
|
||||
vreinterpretq_u8_u16(row6_ne_0));
|
||||
|
||||
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
|
||||
const uint8x16_t bitmap_mask =
|
||||
vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
|
||||
|
||||
uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
|
||||
uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
|
||||
uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
|
||||
uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
|
||||
|
||||
uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
|
||||
uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
|
||||
uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
|
||||
bitmap_rows_3210);
|
||||
uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
|
||||
vget_high_u8(bitmap_rows_76543210));
|
||||
|
||||
/* Shift left to remove DC bit. */
|
||||
bitmap_all =
|
||||
vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
|
||||
/* Count bits set (number of non-zero coefficients) in bitmap. */
|
||||
unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
|
||||
/* Move bitmap to 64-bit scalar register. */
|
||||
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
|
||||
|
||||
/* Set up state and bit buffer for output bitstream. */
|
||||
working_state *state_ptr = (working_state *)state;
|
||||
int free_bits = state_ptr->cur.free_bits;
|
||||
size_t put_buffer = state_ptr->cur.put_buffer;
|
||||
|
||||
/* Encode DC coefficient. */
|
||||
|
||||
/* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
|
||||
int16x8_t abs_row0 = vabsq_s16(row0);
|
||||
int16x8_t row0_lz = vclzq_s16(abs_row0);
|
||||
uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
|
||||
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
|
||||
/* Find nbits required to specify sign and amplitude of coefficient. */
|
||||
unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
|
||||
unsigned int nbits = 16 - lz;
|
||||
/* Emit Huffman-coded symbol and additional diff bits. */
|
||||
unsigned int diff = vgetq_lane_u16(row0_diff, 0);
|
||||
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
|
||||
|
||||
/* Encode AC coefficients. */
|
||||
|
||||
unsigned int r = 0; /* r = run length of zeros */
|
||||
unsigned int i = 1; /* i = number of coefficients encoded */
|
||||
/* Code and size information for a run length of 16 zero coefficients */
|
||||
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
|
||||
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
|
||||
|
||||
/* The most efficient method of computing nbits and diff depends on the
|
||||
* number of non-zero coefficients. If the bitmap is not too sparse (> 8
|
||||
* non-zero AC coefficients), it is beneficial to do all of the work using
|
||||
* Neon; else we do some of the work using Neon and the rest on demand using
|
||||
* scalar code.
|
||||
*/
|
||||
if (non_zero_coefficients > 8) {
|
||||
uint8_t block_nbits[DCTSIZE2];
|
||||
|
||||
int16x8_t abs_row1 = vabsq_s16(row1);
|
||||
int16x8_t abs_row2 = vabsq_s16(row2);
|
||||
int16x8_t abs_row3 = vabsq_s16(row3);
|
||||
int16x8_t abs_row4 = vabsq_s16(row4);
|
||||
int16x8_t abs_row5 = vabsq_s16(row5);
|
||||
int16x8_t abs_row6 = vabsq_s16(row6);
|
||||
int16x8_t abs_row7 = vabsq_s16(row7);
|
||||
int16x8_t row1_lz = vclzq_s16(abs_row1);
|
||||
int16x8_t row2_lz = vclzq_s16(abs_row2);
|
||||
int16x8_t row3_lz = vclzq_s16(abs_row3);
|
||||
int16x8_t row4_lz = vclzq_s16(abs_row4);
|
||||
int16x8_t row5_lz = vclzq_s16(abs_row5);
|
||||
int16x8_t row6_lz = vclzq_s16(abs_row6);
|
||||
int16x8_t row7_lz = vclzq_s16(abs_row7);
|
||||
/* Narrow leading zero count to 8 bits. */
|
||||
uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
|
||||
vreinterpretq_u8_s16(row1_lz));
|
||||
uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
|
||||
vreinterpretq_u8_s16(row3_lz));
|
||||
uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
|
||||
vreinterpretq_u8_s16(row5_lz));
|
||||
uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
|
||||
vreinterpretq_u8_s16(row7_lz));
|
||||
/* Compute nbits needed to specify magnitude of each coefficient. */
|
||||
uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
|
||||
uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
|
||||
uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
|
||||
uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
|
||||
/* Store nbits. */
|
||||
vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
|
||||
vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
|
||||
vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
|
||||
vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
|
||||
/* Mask bits not required to specify sign and amplitude of diff. */
|
||||
uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
|
||||
uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
|
||||
uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
|
||||
uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
|
||||
uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
|
||||
uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
|
||||
uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
|
||||
/* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
|
||||
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
|
||||
row1_mask);
|
||||
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
|
||||
row2_mask);
|
||||
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
|
||||
row3_mask);
|
||||
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
|
||||
row4_mask);
|
||||
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
|
||||
row5_mask);
|
||||
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
|
||||
row6_mask);
|
||||
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
|
||||
row7_mask);
|
||||
/* Store diff bits. */
|
||||
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
|
||||
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
|
||||
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
|
||||
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
|
||||
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
|
||||
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
|
||||
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
|
||||
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
|
||||
|
||||
while (bitmap != 0) {
|
||||
r = BUILTIN_CLZLL(bitmap);
|
||||
i += r;
|
||||
bitmap <<= r;
|
||||
nbits = block_nbits[i];
|
||||
diff = block_diff[i];
|
||||
while (r > 15) {
|
||||
/* If run length > 15, emit special run-length-16 codes. */
|
||||
PUT_BITS(code_0xf0, size_0xf0)
|
||||
r -= 16;
|
||||
}
|
||||
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
|
||||
unsigned int rs = (r << 4) + nbits;
|
||||
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
|
||||
i++;
|
||||
bitmap <<= 1;
|
||||
}
|
||||
} else if (bitmap != 0) {
|
||||
uint16_t block_abs[DCTSIZE2];
|
||||
/* Compute and store absolute value of coefficients. */
|
||||
int16x8_t abs_row1 = vabsq_s16(row1);
|
||||
int16x8_t abs_row2 = vabsq_s16(row2);
|
||||
int16x8_t abs_row3 = vabsq_s16(row3);
|
||||
int16x8_t abs_row4 = vabsq_s16(row4);
|
||||
int16x8_t abs_row5 = vabsq_s16(row5);
|
||||
int16x8_t abs_row6 = vabsq_s16(row6);
|
||||
int16x8_t abs_row7 = vabsq_s16(row7);
|
||||
vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
|
||||
vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
|
||||
vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
|
||||
vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
|
||||
vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
|
||||
vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
|
||||
vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
|
||||
vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
|
||||
/* Compute diff bits (without nbits mask) and store. */
|
||||
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
|
||||
vcltzq_s16(row1));
|
||||
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
|
||||
vcltzq_s16(row2));
|
||||
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
|
||||
vcltzq_s16(row3));
|
||||
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
|
||||
vcltzq_s16(row4));
|
||||
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
|
||||
vcltzq_s16(row5));
|
||||
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
|
||||
vcltzq_s16(row6));
|
||||
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
|
||||
vcltzq_s16(row7));
|
||||
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
|
||||
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
|
||||
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
|
||||
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
|
||||
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
|
||||
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
|
||||
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
|
||||
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
|
||||
|
||||
/* Same as above but must mask diff bits and compute nbits on demand. */
|
||||
while (bitmap != 0) {
|
||||
r = BUILTIN_CLZLL(bitmap);
|
||||
i += r;
|
||||
bitmap <<= r;
|
||||
lz = BUILTIN_CLZ(block_abs[i]);
|
||||
nbits = 32 - lz;
|
||||
diff = ((unsigned int)block_diff[i] << lz) >> lz;
|
||||
while (r > 15) {
|
||||
/* If run length > 15, emit special run-length-16 codes. */
|
||||
PUT_BITS(code_0xf0, size_0xf0)
|
||||
r -= 16;
|
||||
}
|
||||
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
|
||||
unsigned int rs = (r << 4) + nbits;
|
||||
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
|
||||
i++;
|
||||
bitmap <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
|
||||
* The value of RS for the EOB code is 0.
|
||||
*/
|
||||
if (i != 64) {
|
||||
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
|
||||
}
|
||||
|
||||
state_ptr->cur.put_buffer = put_buffer;
|
||||
state_ptr->cur.free_bits = free_bits;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
1053
android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd.c
vendored
Normal file
1053
android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2254
android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd_neon.S
vendored
Normal file
2254
android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd_neon.S
vendored
Normal file
File diff suppressed because it is too large
Load Diff
28
android/extern/libjpeg-turbo/simd/arm/align.h
vendored
Normal file
28
android/extern/libjpeg-turbo/simd/arm/align.h
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* How to obtain memory alignment for structures and variables */
|
||||
#if defined(_MSC_VER)
|
||||
#define ALIGN(alignment) __declspec(align(alignment))
|
||||
#elif defined(__clang__) || defined(__GNUC__)
|
||||
#define ALIGN(alignment) __attribute__((aligned(alignment)))
|
||||
#else
|
||||
#error "Unknown compiler"
|
||||
#endif
|
||||
160
android/extern/libjpeg-turbo/simd/arm/jccolor-neon.c
vendored
Normal file
160
android/extern/libjpeg-turbo/simd/arm/jccolor-neon.c
vendored
Normal file
@@ -0,0 +1,160 @@
|
||||
/*
|
||||
* jccolor-neon.c - colorspace conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
#include "neon-compat.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* RGB -> YCbCr conversion constants */
|
||||
|
||||
#define F_0_298 19595
|
||||
#define F_0_587 38470
|
||||
#define F_0_113 7471
|
||||
#define F_0_168 11059
|
||||
#define F_0_331 21709
|
||||
#define F_0_500 32768
|
||||
#define F_0_418 27439
|
||||
#define F_0_081 5329
|
||||
|
||||
ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
|
||||
F_0_298, F_0_587, F_0_113, F_0_168,
|
||||
F_0_331, F_0_500, F_0_418, F_0_081
|
||||
};
|
||||
|
||||
|
||||
/* Include inline routines for colorspace extensions. */
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "aarch64/jccolext-neon.c"
|
||||
#else
|
||||
#include "aarch32/jccolext-neon.c"
|
||||
#endif
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_RED EXT_RGB_RED
|
||||
#define RGB_GREEN EXT_RGB_GREEN
|
||||
#define RGB_BLUE EXT_RGB_BLUE
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "aarch64/jccolext-neon.c"
|
||||
#else
|
||||
#include "aarch32/jccolext-neon.c"
|
||||
#endif
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_ycc_convert_neon
|
||||
|
||||
#define RGB_RED EXT_RGBX_RED
|
||||
#define RGB_GREEN EXT_RGBX_GREEN
|
||||
#define RGB_BLUE EXT_RGBX_BLUE
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "aarch64/jccolext-neon.c"
|
||||
#else
|
||||
#include "aarch32/jccolext-neon.c"
|
||||
#endif
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_ycc_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGR_RED
|
||||
#define RGB_GREEN EXT_BGR_GREEN
|
||||
#define RGB_BLUE EXT_BGR_BLUE
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "aarch64/jccolext-neon.c"
|
||||
#else
|
||||
#include "aarch32/jccolext-neon.c"
|
||||
#endif
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_ycc_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGRX_RED
|
||||
#define RGB_GREEN EXT_BGRX_GREEN
|
||||
#define RGB_BLUE EXT_BGRX_BLUE
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "aarch64/jccolext-neon.c"
|
||||
#else
|
||||
#include "aarch32/jccolext-neon.c"
|
||||
#endif
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_ycc_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XBGR_RED
|
||||
#define RGB_GREEN EXT_XBGR_GREEN
|
||||
#define RGB_BLUE EXT_XBGR_BLUE
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "aarch64/jccolext-neon.c"
|
||||
#else
|
||||
#include "aarch32/jccolext-neon.c"
|
||||
#endif
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_ycc_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XRGB_RED
|
||||
#define RGB_GREEN EXT_XRGB_GREEN
|
||||
#define RGB_BLUE EXT_XRGB_BLUE
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "aarch64/jccolext-neon.c"
|
||||
#else
|
||||
#include "aarch32/jccolext-neon.c"
|
||||
#endif
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_ycc_convert_neon
|
||||
120
android/extern/libjpeg-turbo/simd/arm/jcgray-neon.c
vendored
Normal file
120
android/extern/libjpeg-turbo/simd/arm/jcgray-neon.c
vendored
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* RGB -> Grayscale conversion constants */
|
||||
|
||||
#define F_0_298 19595
|
||||
#define F_0_587 38470
|
||||
#define F_0_113 7471
|
||||
|
||||
|
||||
/* Include inline routines for colorspace extensions. */
|
||||
|
||||
#include "jcgryext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_RED EXT_RGB_RED
|
||||
#define RGB_GREEN EXT_RGB_GREEN
|
||||
#define RGB_BLUE EXT_RGB_BLUE
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
|
||||
#include "jcgryext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_gray_convert_neon
|
||||
|
||||
#define RGB_RED EXT_RGBX_RED
|
||||
#define RGB_GREEN EXT_RGBX_GREEN
|
||||
#define RGB_BLUE EXT_RGBX_BLUE
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
|
||||
#include "jcgryext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_gray_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGR_RED
|
||||
#define RGB_GREEN EXT_BGR_GREEN
|
||||
#define RGB_BLUE EXT_BGR_BLUE
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
|
||||
#include "jcgryext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_gray_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGRX_RED
|
||||
#define RGB_GREEN EXT_BGRX_GREEN
|
||||
#define RGB_BLUE EXT_BGRX_BLUE
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
|
||||
#include "jcgryext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_gray_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XBGR_RED
|
||||
#define RGB_GREEN EXT_XBGR_GREEN
|
||||
#define RGB_BLUE EXT_XBGR_BLUE
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
|
||||
#include "jcgryext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_gray_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XRGB_RED
|
||||
#define RGB_GREEN EXT_XRGB_GREEN
|
||||
#define RGB_BLUE EXT_XRGB_BLUE
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
|
||||
#include "jcgryext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_rgb_gray_convert_neon
|
||||
106
android/extern/libjpeg-turbo/simd/arm/jcgryext-neon.c
vendored
Normal file
106
android/extern/libjpeg-turbo/simd/arm/jcgryext-neon.c
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
* jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jcgray-neon.c */
|
||||
|
||||
|
||||
/* RGB -> Grayscale conversion is defined by the following equation:
|
||||
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
*
|
||||
* Avoid floating point arithmetic by using shifted integer constants:
|
||||
* 0.29899597 = 19595 * 2^-16
|
||||
* 0.58700561 = 38470 * 2^-16
|
||||
* 0.11399841 = 7471 * 2^-16
|
||||
* These constants are defined in jcgray-neon.c
|
||||
*
|
||||
* This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
|
||||
*/
|
||||
|
||||
void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
|
||||
JSAMPIMAGE output_buf, JDIMENSION output_row,
|
||||
int num_rows)
|
||||
{
|
||||
JSAMPROW inptr;
|
||||
JSAMPROW outptr;
|
||||
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
|
||||
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr = *input_buf++;
|
||||
outptr = output_buf[0][output_row];
|
||||
output_row++;
|
||||
|
||||
int cols_remaining = image_width;
|
||||
for (; cols_remaining > 0; cols_remaining -= 16) {
|
||||
|
||||
/* To prevent buffer overread by the vector load instructions, the last
|
||||
* (image_width % 16) columns of data are first memcopied to a temporary
|
||||
* buffer large enough to accommodate the vector load.
|
||||
*/
|
||||
if (cols_remaining < 16) {
|
||||
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
|
||||
inptr = tmp_buf;
|
||||
}
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x16x4_t input_pixels = vld4q_u8(inptr);
|
||||
#else
|
||||
uint8x16x3_t input_pixels = vld3q_u8(inptr);
|
||||
#endif
|
||||
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
|
||||
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
|
||||
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
|
||||
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
|
||||
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
|
||||
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
|
||||
|
||||
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
|
||||
uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
|
||||
uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
|
||||
uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
|
||||
uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
|
||||
y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
|
||||
y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
|
||||
y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
|
||||
y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
|
||||
y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
|
||||
y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
|
||||
y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
|
||||
y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
|
||||
|
||||
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
|
||||
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
|
||||
vrshrn_n_u32(y_lh, 16));
|
||||
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
|
||||
vrshrn_n_u32(y_hh, 16));
|
||||
|
||||
/* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
|
||||
* permitted up to the next multiple of ALIGN_SIZE bytes.
|
||||
*/
|
||||
vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
|
||||
|
||||
/* Increment pointers. */
|
||||
inptr += (16 * RGB_PIXELSIZE);
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
131
android/extern/libjpeg-turbo/simd/arm/jchuff.h
vendored
Normal file
131
android/extern/libjpeg-turbo/simd/arm/jchuff.h
vendored
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
* jchuff.h
|
||||
*
|
||||
* This file was part of the Independent JPEG Group's software:
|
||||
* Copyright (C) 1991-1997, Thomas G. Lane.
|
||||
* libjpeg-turbo Modifications:
|
||||
* Copyright (C) 2009, 2018, 2021, D. R. Commander.
|
||||
* Copyright (C) 2018, Matthias Räncker.
|
||||
* Copyright (C) 2020-2021, Arm Limited.
|
||||
* For conditions of distribution and use, see the accompanying README.ijg
|
||||
* file.
|
||||
*/
|
||||
|
||||
/* Expanded entropy encoder object for Huffman encoding.
|
||||
*
|
||||
* The savable_state subrecord contains fields that change within an MCU,
|
||||
* but must not be updated permanently until we complete the MCU.
|
||||
*/
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define BIT_BUF_SIZE 64
|
||||
#else
|
||||
#define BIT_BUF_SIZE 32
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
size_t put_buffer; /* current bit accumulation buffer */
|
||||
int free_bits; /* # of bits available in it */
|
||||
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
|
||||
} savable_state;
|
||||
|
||||
typedef struct {
|
||||
JOCTET *next_output_byte; /* => next byte to write in buffer */
|
||||
size_t free_in_buffer; /* # of byte spaces remaining in buffer */
|
||||
savable_state cur; /* Current bit buffer & DC state */
|
||||
j_compress_ptr cinfo; /* dump_buffer needs access to this */
|
||||
int simd;
|
||||
} working_state;
|
||||
|
||||
/* Outputting bits to the file */
|
||||
|
||||
/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
|
||||
* as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
|
||||
* 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
|
||||
* speculative 0 byte will be overwritten by the next byte.
|
||||
*/
|
||||
#define EMIT_BYTE(b) { \
|
||||
buffer[0] = (JOCTET)(b); \
|
||||
buffer[1] = 0; \
|
||||
buffer -= -2 + ((JOCTET)(b) < 0xFF); \
|
||||
}
|
||||
|
||||
/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
|
||||
* directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
|
||||
* encode 0xFF as 0xFF 0x00.
|
||||
*/
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
|
||||
#define FLUSH() { \
|
||||
if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
|
||||
EMIT_BYTE(put_buffer >> 56) \
|
||||
EMIT_BYTE(put_buffer >> 48) \
|
||||
EMIT_BYTE(put_buffer >> 40) \
|
||||
EMIT_BYTE(put_buffer >> 32) \
|
||||
EMIT_BYTE(put_buffer >> 24) \
|
||||
EMIT_BYTE(put_buffer >> 16) \
|
||||
EMIT_BYTE(put_buffer >> 8) \
|
||||
EMIT_BYTE(put_buffer ) \
|
||||
} else { \
|
||||
*((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
|
||||
buffer += 8; \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#define SPLAT() { \
|
||||
buffer[0] = (JOCTET)(put_buffer >> 24); \
|
||||
buffer[1] = (JOCTET)(put_buffer >> 16); \
|
||||
buffer[2] = (JOCTET)(put_buffer >> 8); \
|
||||
buffer[3] = (JOCTET)(put_buffer ); \
|
||||
buffer += 4; \
|
||||
}
|
||||
#else
|
||||
#define SPLAT() { \
|
||||
put_buffer = __builtin_bswap32(put_buffer); \
|
||||
__asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#define FLUSH() { \
|
||||
if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
|
||||
EMIT_BYTE(put_buffer >> 24) \
|
||||
EMIT_BYTE(put_buffer >> 16) \
|
||||
EMIT_BYTE(put_buffer >> 8) \
|
||||
EMIT_BYTE(put_buffer ) \
|
||||
} else { \
|
||||
SPLAT(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Fill the bit buffer to capacity with the leading bits from code, then output
|
||||
* the bit buffer and put the remaining bits from code into the bit buffer.
|
||||
*/
|
||||
#define PUT_AND_FLUSH(code, size) { \
|
||||
put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
|
||||
FLUSH() \
|
||||
free_bits += BIT_BUF_SIZE; \
|
||||
put_buffer = code; \
|
||||
}
|
||||
|
||||
/* Insert code into the bit buffer and output the bit buffer if needed.
|
||||
* NOTE: We can't flush with free_bits == 0, since the left shift in
|
||||
* PUT_AND_FLUSH() would have undefined behavior.
|
||||
*/
|
||||
#define PUT_BITS(code, size) { \
|
||||
free_bits -= size; \
|
||||
if (free_bits < 0) \
|
||||
PUT_AND_FLUSH(code, size) \
|
||||
else \
|
||||
put_buffer = (put_buffer << size) | code; \
|
||||
}
|
||||
|
||||
#define PUT_CODE(code, size, diff) { \
|
||||
diff |= code << nbits; \
|
||||
nbits += size; \
|
||||
PUT_BITS(diff, nbits) \
|
||||
}
|
||||
623
android/extern/libjpeg-turbo/simd/arm/jcphuff-neon.c
vendored
Normal file
623
android/extern/libjpeg-turbo/simd/arm/jcphuff-neon.c
vendored
Normal file
@@ -0,0 +1,623 @@
|
||||
/*
|
||||
* jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2022, Matthieu Darbois. All Rights Reserved.
|
||||
* Copyright (C) 2022, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "neon-compat.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* Data preparation for encode_mcu_AC_first().
|
||||
*
|
||||
* The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
|
||||
* found in jcphuff.c.
|
||||
*/
|
||||
|
||||
void jsimd_encode_mcu_AC_first_prepare_neon
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
UJCOEF *values, size_t *zerobits)
|
||||
{
|
||||
UJCOEF *values_ptr = values;
|
||||
UJCOEF *diff_values_ptr = values + DCTSIZE2;
|
||||
|
||||
/* Rows of coefficients to zero (since they haven't been processed) */
|
||||
int i, rows_to_zero = 8;
|
||||
|
||||
for (i = 0; i < Sl / 16; i++) {
|
||||
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
|
||||
int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
|
||||
|
||||
/* Isolate sign of coefficients. */
|
||||
uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
|
||||
uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
|
||||
/* Compute absolute value of coefficients and apply point transform Al. */
|
||||
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
|
||||
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
|
||||
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
|
||||
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
|
||||
|
||||
/* Compute diff values. */
|
||||
uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
|
||||
uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
|
||||
|
||||
/* Store transformed coefficients and diff values. */
|
||||
vst1q_u16(values_ptr, abs_coefs1);
|
||||
vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
|
||||
vst1q_u16(diff_values_ptr, diff1);
|
||||
vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
|
||||
values_ptr += 16;
|
||||
diff_values_ptr += 16;
|
||||
jpeg_natural_order_start += 16;
|
||||
rows_to_zero -= 2;
|
||||
}
|
||||
|
||||
/* Same operation but for remaining partial vector */
|
||||
int remaining_coefs = Sl % 16;
|
||||
if (remaining_coefs > 8) {
|
||||
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
|
||||
int16x8_t coefs2 = vdupq_n_s16(0);
|
||||
switch (remaining_coefs) {
|
||||
case 15:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 14:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 13:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 12:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 11:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 10:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 9:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/* Isolate sign of coefficients. */
|
||||
uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
|
||||
uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
|
||||
/* Compute absolute value of coefficients and apply point transform Al. */
|
||||
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
|
||||
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
|
||||
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
|
||||
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
|
||||
|
||||
/* Compute diff values. */
|
||||
uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
|
||||
uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
|
||||
|
||||
/* Store transformed coefficients and diff values. */
|
||||
vst1q_u16(values_ptr, abs_coefs1);
|
||||
vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
|
||||
vst1q_u16(diff_values_ptr, diff1);
|
||||
vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
|
||||
values_ptr += 16;
|
||||
diff_values_ptr += 16;
|
||||
rows_to_zero -= 2;
|
||||
|
||||
} else if (remaining_coefs > 0) {
|
||||
int16x8_t coefs = vdupq_n_s16(0);
|
||||
|
||||
switch (remaining_coefs) {
|
||||
case 8:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 7:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/* Isolate sign of coefficients. */
|
||||
uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
|
||||
/* Compute absolute value of coefficients and apply point transform Al. */
|
||||
uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
|
||||
abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
|
||||
|
||||
/* Compute diff values. */
|
||||
uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
|
||||
|
||||
/* Store transformed coefficients and diff values. */
|
||||
vst1q_u16(values_ptr, abs_coefs);
|
||||
vst1q_u16(diff_values_ptr, diff);
|
||||
values_ptr += 8;
|
||||
diff_values_ptr += 8;
|
||||
rows_to_zero--;
|
||||
}
|
||||
|
||||
/* Zero remaining memory in the values and diff_values blocks. */
|
||||
for (i = 0; i < rows_to_zero; i++) {
|
||||
vst1q_u16(values_ptr, vdupq_n_u16(0));
|
||||
vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
|
||||
values_ptr += 8;
|
||||
diff_values_ptr += 8;
|
||||
}
|
||||
|
||||
/* Construct zerobits bitmap. A set bit means that the corresponding
|
||||
* coefficient != 0.
|
||||
*/
|
||||
uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
|
||||
uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
|
||||
uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
|
||||
uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
|
||||
uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
|
||||
uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
|
||||
uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
|
||||
uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
|
||||
|
||||
uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
|
||||
uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
|
||||
uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
|
||||
uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
|
||||
uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
|
||||
uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
|
||||
uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
|
||||
uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
|
||||
|
||||
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
|
||||
const uint8x8_t bitmap_mask =
|
||||
vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
|
||||
|
||||
row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
|
||||
row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
|
||||
row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
|
||||
row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
|
||||
row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
|
||||
row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
|
||||
row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
|
||||
row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
|
||||
|
||||
uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
|
||||
uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
|
||||
uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
|
||||
uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
|
||||
uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
|
||||
uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
|
||||
uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Move bitmap to a 64-bit scalar register. */
|
||||
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
|
||||
/* Store zerobits bitmap. */
|
||||
*zerobits = ~bitmap;
|
||||
#else
|
||||
/* Move bitmap to two 32-bit scalar registers. */
|
||||
uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
|
||||
uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
|
||||
/* Store zerobits bitmap. */
|
||||
zerobits[0] = ~bitmap0;
|
||||
zerobits[1] = ~bitmap1;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* Data preparation for encode_mcu_AC_refine().
|
||||
*
|
||||
* The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
|
||||
* found in jcphuff.c.
|
||||
*/
|
||||
|
||||
int jsimd_encode_mcu_AC_refine_prepare_neon
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
UJCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
/* Temporary storage buffers for data used to compute the signbits bitmap and
|
||||
* the end-of-block (EOB) position
|
||||
*/
|
||||
uint8_t coef_sign_bits[64];
|
||||
uint8_t coef_eq1_bits[64];
|
||||
|
||||
UJCOEF *absvalues_ptr = absvalues;
|
||||
uint8_t *coef_sign_bits_ptr = coef_sign_bits;
|
||||
uint8_t *eq1_bits_ptr = coef_eq1_bits;
|
||||
|
||||
/* Rows of coefficients to zero (since they haven't been processed) */
|
||||
int i, rows_to_zero = 8;
|
||||
|
||||
for (i = 0; i < Sl / 16; i++) {
|
||||
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
|
||||
int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
|
||||
|
||||
/* Compute and store data for signbits bitmap. */
|
||||
uint8x8_t sign_coefs1 =
|
||||
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
|
||||
uint8x8_t sign_coefs2 =
|
||||
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
|
||||
vst1_u8(coef_sign_bits_ptr, sign_coefs1);
|
||||
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
|
||||
|
||||
/* Compute absolute value of coefficients and apply point transform Al. */
|
||||
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
|
||||
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
|
||||
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
|
||||
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
|
||||
vst1q_u16(absvalues_ptr, abs_coefs1);
|
||||
vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
|
||||
|
||||
/* Test whether transformed coefficient values == 1 (used to find EOB
|
||||
* position.)
|
||||
*/
|
||||
uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
|
||||
uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
|
||||
vst1_u8(eq1_bits_ptr, coefs_eq11);
|
||||
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
|
||||
|
||||
absvalues_ptr += 16;
|
||||
coef_sign_bits_ptr += 16;
|
||||
eq1_bits_ptr += 16;
|
||||
jpeg_natural_order_start += 16;
|
||||
rows_to_zero -= 2;
|
||||
}
|
||||
|
||||
/* Same operation but for remaining partial vector */
|
||||
int remaining_coefs = Sl % 16;
|
||||
if (remaining_coefs > 8) {
|
||||
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
|
||||
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
|
||||
int16x8_t coefs2 = vdupq_n_s16(0);
|
||||
switch (remaining_coefs) {
|
||||
case 15:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 14:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 13:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 12:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 11:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 10:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 9:
|
||||
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/* Compute and store data for signbits bitmap. */
|
||||
uint8x8_t sign_coefs1 =
|
||||
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
|
||||
uint8x8_t sign_coefs2 =
|
||||
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
|
||||
vst1_u8(coef_sign_bits_ptr, sign_coefs1);
|
||||
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
|
||||
|
||||
/* Compute absolute value of coefficients and apply point transform Al. */
|
||||
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
|
||||
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
|
||||
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
|
||||
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
|
||||
vst1q_u16(absvalues_ptr, abs_coefs1);
|
||||
vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
|
||||
|
||||
/* Test whether transformed coefficient values == 1 (used to find EOB
|
||||
* position.)
|
||||
*/
|
||||
uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
|
||||
uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
|
||||
vst1_u8(eq1_bits_ptr, coefs_eq11);
|
||||
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
|
||||
|
||||
absvalues_ptr += 16;
|
||||
coef_sign_bits_ptr += 16;
|
||||
eq1_bits_ptr += 16;
|
||||
jpeg_natural_order_start += 16;
|
||||
rows_to_zero -= 2;
|
||||
|
||||
} else if (remaining_coefs > 0) {
|
||||
int16x8_t coefs = vdupq_n_s16(0);
|
||||
|
||||
switch (remaining_coefs) {
|
||||
case 8:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 7:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/* Compute and store data for signbits bitmap. */
|
||||
uint8x8_t sign_coefs =
|
||||
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
|
||||
vst1_u8(coef_sign_bits_ptr, sign_coefs);
|
||||
|
||||
/* Compute absolute value of coefficients and apply point transform Al. */
|
||||
uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
|
||||
abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
|
||||
vst1q_u16(absvalues_ptr, abs_coefs);
|
||||
|
||||
/* Test whether transformed coefficient values == 1 (used to find EOB
|
||||
* position.)
|
||||
*/
|
||||
uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
|
||||
vst1_u8(eq1_bits_ptr, coefs_eq1);
|
||||
|
||||
absvalues_ptr += 8;
|
||||
coef_sign_bits_ptr += 8;
|
||||
eq1_bits_ptr += 8;
|
||||
rows_to_zero--;
|
||||
}
|
||||
|
||||
/* Zero remaining memory in blocks. */
|
||||
for (i = 0; i < rows_to_zero; i++) {
|
||||
vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
|
||||
vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
|
||||
vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
|
||||
absvalues_ptr += 8;
|
||||
coef_sign_bits_ptr += 8;
|
||||
eq1_bits_ptr += 8;
|
||||
}
|
||||
|
||||
/* Construct zerobits bitmap. */
|
||||
uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
|
||||
uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
|
||||
uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
|
||||
uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
|
||||
uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
|
||||
uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
|
||||
uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
|
||||
uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
|
||||
|
||||
uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
|
||||
uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
|
||||
uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
|
||||
uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
|
||||
uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
|
||||
uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
|
||||
uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
|
||||
uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
|
||||
|
||||
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
|
||||
const uint8x8_t bitmap_mask =
|
||||
vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
|
||||
|
||||
abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
|
||||
abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
|
||||
abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
|
||||
abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
|
||||
abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
|
||||
abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
|
||||
abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
|
||||
abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
|
||||
|
||||
uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
|
||||
uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
|
||||
uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
|
||||
uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
|
||||
uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
|
||||
uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
|
||||
uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Move bitmap to a 64-bit scalar register. */
|
||||
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
|
||||
/* Store zerobits bitmap. */
|
||||
bits[0] = ~bitmap;
|
||||
#else
|
||||
/* Move bitmap to two 32-bit scalar registers. */
|
||||
uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
|
||||
uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
|
||||
/* Store zerobits bitmap. */
|
||||
bits[0] = ~bitmap0;
|
||||
bits[1] = ~bitmap1;
|
||||
#endif
|
||||
|
||||
/* Construct signbits bitmap. */
|
||||
uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
|
||||
uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
|
||||
uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
|
||||
uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
|
||||
uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
|
||||
uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
|
||||
uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
|
||||
uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
|
||||
|
||||
signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
|
||||
signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
|
||||
signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
|
||||
signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
|
||||
signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
|
||||
signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
|
||||
signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
|
||||
signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
|
||||
|
||||
bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
|
||||
bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
|
||||
bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
|
||||
bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
|
||||
bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
|
||||
bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
|
||||
bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Move bitmap to a 64-bit scalar register. */
|
||||
bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
|
||||
/* Store signbits bitmap. */
|
||||
bits[1] = ~bitmap;
|
||||
#else
|
||||
/* Move bitmap to two 32-bit scalar registers. */
|
||||
bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
|
||||
bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
|
||||
/* Store signbits bitmap. */
|
||||
bits[2] = ~bitmap0;
|
||||
bits[3] = ~bitmap1;
|
||||
#endif
|
||||
|
||||
/* Construct bitmap to find EOB position (the index of the last coefficient
|
||||
* equal to 1.)
|
||||
*/
|
||||
uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
|
||||
uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
|
||||
uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
|
||||
uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
|
||||
uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
|
||||
uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
|
||||
uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
|
||||
uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
|
||||
|
||||
row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
|
||||
row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
|
||||
row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
|
||||
row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
|
||||
row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
|
||||
row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
|
||||
row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
|
||||
row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
|
||||
|
||||
bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
|
||||
bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
|
||||
bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
|
||||
bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
|
||||
bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
|
||||
bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
|
||||
bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Move bitmap to a 64-bit scalar register. */
|
||||
bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
|
||||
|
||||
/* Return EOB position. */
|
||||
if (bitmap == 0) {
|
||||
/* EOB position is defined to be 0 if all coefficients != 1. */
|
||||
return 0;
|
||||
} else {
|
||||
return 63 - BUILTIN_CLZLL(bitmap);
|
||||
}
|
||||
#else
|
||||
/* Move bitmap to two 32-bit scalar registers. */
|
||||
bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
|
||||
bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
|
||||
|
||||
/* Return EOB position. */
|
||||
if (bitmap0 == 0 && bitmap1 == 0) {
|
||||
return 0;
|
||||
} else if (bitmap1 != 0) {
|
||||
return 63 - BUILTIN_CLZ(bitmap1);
|
||||
} else {
|
||||
return 31 - BUILTIN_CLZ(bitmap0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
192
android/extern/libjpeg-turbo/simd/arm/jcsample-neon.c
vendored
Normal file
192
android/extern/libjpeg-turbo/simd/arm/jcsample-neon.c
vendored
Normal file
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
* jcsample-neon.c - downsampling (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
|
||||
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
|
||||
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
|
||||
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
|
||||
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
|
||||
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
|
||||
0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
|
||||
0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
};
|
||||
|
||||
|
||||
/* Downsample pixel values of a single component.
|
||||
* This version handles the common case of 2:1 horizontal and 1:1 vertical,
|
||||
* without smoothing.
|
||||
*/
|
||||
|
||||
void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
|
||||
JDIMENSION v_samp_factor,
|
||||
JDIMENSION width_in_blocks,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
JSAMPROW inptr, outptr;
|
||||
/* Load expansion mask to pad remaining elements of last DCT block. */
|
||||
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
|
||||
const uint8x16_t expand_mask =
|
||||
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
|
||||
/* Load bias pattern (alternating every pixel.) */
|
||||
/* { 0, 1, 0, 1, 0, 1, 0, 1 } */
|
||||
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
|
||||
unsigned i, outrow;
|
||||
|
||||
for (outrow = 0; outrow < v_samp_factor; outrow++) {
|
||||
outptr = output_data[outrow];
|
||||
inptr = input_data[outrow];
|
||||
|
||||
/* Downsample all but the last DCT block of pixels. */
|
||||
for (i = 0; i < width_in_blocks - 1; i++) {
|
||||
uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
|
||||
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
|
||||
/* Divide total by 2 and narrow to 8-bit. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
|
||||
/* Store samples to memory. */
|
||||
vst1_u8(outptr + i * DCTSIZE, samples_u8);
|
||||
}
|
||||
|
||||
/* Load pixels in last DCT block into a table. */
|
||||
uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Pad the empty elements with the value of the last pixel. */
|
||||
pixels = vqtbl1q_u8(pixels, expand_mask);
|
||||
#else
|
||||
uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
|
||||
pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
|
||||
vtbl2_u8(table, vget_high_u8(expand_mask)));
|
||||
#endif
|
||||
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
|
||||
/* Divide total by 2, narrow to 8-bit, and store. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
|
||||
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Downsample pixel values of a single component.
|
||||
* This version handles the standard case of 2:1 horizontal and 2:1 vertical,
|
||||
* without smoothing.
|
||||
*/
|
||||
|
||||
void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
|
||||
JDIMENSION v_samp_factor,
|
||||
JDIMENSION width_in_blocks,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
JSAMPROW inptr0, inptr1, outptr;
|
||||
/* Load expansion mask to pad remaining elements of last DCT block. */
|
||||
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
|
||||
const uint8x16_t expand_mask =
|
||||
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
|
||||
/* Load bias pattern (alternating every pixel.) */
|
||||
/* { 1, 2, 1, 2, 1, 2, 1, 2 } */
|
||||
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
|
||||
unsigned i, outrow;
|
||||
|
||||
for (outrow = 0; outrow < v_samp_factor; outrow++) {
|
||||
outptr = output_data[outrow];
|
||||
inptr0 = input_data[outrow];
|
||||
inptr1 = input_data[outrow + 1];
|
||||
|
||||
/* Downsample all but the last DCT block of pixels. */
|
||||
for (i = 0; i < width_in_blocks - 1; i++) {
|
||||
uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
|
||||
uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
|
||||
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
|
||||
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
|
||||
*/
|
||||
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
|
||||
/* Divide total by 4 and narrow to 8-bit. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
|
||||
/* Store samples to memory and increment pointers. */
|
||||
vst1_u8(outptr + i * DCTSIZE, samples_u8);
|
||||
}
|
||||
|
||||
/* Load pixels in last DCT block into a table. */
|
||||
uint8x16_t pixels_r0 =
|
||||
vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
|
||||
uint8x16_t pixels_r1 =
|
||||
vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Pad the empty elements with the value of the last pixel. */
|
||||
pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
|
||||
pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
|
||||
#else
|
||||
uint8x8x2_t table_r0 =
|
||||
{ { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
|
||||
uint8x8x2_t table_r1 =
|
||||
{ { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
|
||||
pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
|
||||
vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
|
||||
pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
|
||||
vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
|
||||
#endif
|
||||
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
|
||||
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
|
||||
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
|
||||
/* Divide total by 4, narrow to 8-bit, and store. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
|
||||
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
|
||||
}
|
||||
}
|
||||
374
android/extern/libjpeg-turbo/simd/arm/jdcolext-neon.c
vendored
Normal file
374
android/extern/libjpeg-turbo/simd/arm/jdcolext-neon.c
vendored
Normal file
@@ -0,0 +1,374 @@
|
||||
/*
|
||||
* jdcolext-neon.c - colorspace conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jdcolor-neon.c. */
|
||||
|
||||
|
||||
/* YCbCr -> RGB conversion is defined by the following equations:
|
||||
* R = Y + 1.40200 * (Cr - 128)
|
||||
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
|
||||
* B = Y + 1.77200 * (Cb - 128)
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.3441467 = 11277 * 2^-15
|
||||
* 0.7141418 = 23401 * 2^-15
|
||||
* 1.4020386 = 22971 * 2^-14
|
||||
* 1.7720337 = 29033 * 2^-14
|
||||
* These constants are defined in jdcolor-neon.c.
|
||||
*
|
||||
* To ensure correct results, rounding is used when descaling.
|
||||
*/
|
||||
|
||||
/* Notes on safe memory access for YCbCr -> RGB conversion routines:
|
||||
*
|
||||
* Input memory buffers can be safely overread up to the next multiple of
|
||||
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
|
||||
* jmemmgr.c.
|
||||
*
|
||||
* The output buffer cannot safely be written beyond output_width, since
|
||||
* output_buf points to a possibly unpadded row in the decompressed image
|
||||
* buffer allocated by the calling program.
|
||||
*/
|
||||
|
||||
void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
|
||||
JDIMENSION input_row, JSAMPARRAY output_buf,
|
||||
int num_rows)
|
||||
{
|
||||
JSAMPROW outptr;
|
||||
/* Pointers to Y, Cb, and Cr data */
|
||||
JSAMPROW inptr0, inptr1, inptr2;
|
||||
|
||||
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
|
||||
const int16x8_t neg_128 = vdupq_n_s16(-128);
|
||||
|
||||
while (--num_rows >= 0) {
|
||||
inptr0 = input_buf[0][input_row];
|
||||
inptr1 = input_buf[1][input_row];
|
||||
inptr2 = input_buf[2][input_row];
|
||||
input_row++;
|
||||
outptr = *output_buf++;
|
||||
int cols_remaining = output_width;
|
||||
for (; cols_remaining >= 16; cols_remaining -= 16) {
|
||||
uint8x16_t y = vld1q_u8(inptr0);
|
||||
uint8x16_t cb = vld1q_u8(inptr1);
|
||||
uint8x16_t cr = vld1q_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_low_u8(cr)));
|
||||
int16x8_t cr_128_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_high_u8(cr)));
|
||||
int16x8_t cb_128_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_low_u8(cb)));
|
||||
int16x8_t cb_128_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
|
||||
vget_high_u8(cb)));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
|
||||
int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
|
||||
consts, 0);
|
||||
int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
|
||||
int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
|
||||
consts, 0);
|
||||
g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
|
||||
consts, 1);
|
||||
g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
|
||||
consts, 1);
|
||||
g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
|
||||
consts, 1);
|
||||
g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
|
||||
consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
|
||||
vrshrn_n_s32(g_sub_y_lh, 15));
|
||||
int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
|
||||
vrshrn_n_s32(g_sub_y_hh, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
|
||||
consts, 2);
|
||||
int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
|
||||
consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
|
||||
consts, 3);
|
||||
int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
|
||||
consts, 3);
|
||||
/* Add Y. */
|
||||
int16x8_t r_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
|
||||
vget_low_u8(y)));
|
||||
int16x8_t r_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
|
||||
vget_high_u8(y)));
|
||||
int16x8_t b_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
|
||||
vget_low_u8(y)));
|
||||
int16x8_t b_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
|
||||
vget_high_u8(y)));
|
||||
int16x8_t g_l =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
|
||||
vget_low_u8(y)));
|
||||
int16x8_t g_h =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
|
||||
vget_high_u8(y)));
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x16x4_t rgba;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
|
||||
rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
|
||||
rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
vst4q_u8(outptr, rgba);
|
||||
#elif RGB_PIXELSIZE == 3
|
||||
uint8x16x3_t rgb;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
|
||||
rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
|
||||
rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst3q_u8(outptr, rgb);
|
||||
#else
|
||||
/* Pack R, G, and B values in ratio 5:6:5. */
|
||||
uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
|
||||
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
|
||||
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
|
||||
uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
|
||||
rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
|
||||
rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst1q_u16((uint16_t *)outptr, rgb565_l);
|
||||
vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
|
||||
#endif
|
||||
|
||||
/* Increment pointers. */
|
||||
inptr0 += 16;
|
||||
inptr1 += 16;
|
||||
inptr2 += 16;
|
||||
outptr += (RGB_PIXELSIZE * 16);
|
||||
}
|
||||
|
||||
if (cols_remaining >= 8) {
|
||||
uint8x8_t y = vld1_u8(inptr0);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
|
||||
consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
|
||||
consts, 3);
|
||||
/* Add Y. */
|
||||
int16x8_t r =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
|
||||
int16x8_t b =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
|
||||
int16x8_t g =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x8x4_t rgba;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgba.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgba.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgba.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
vst4_u8(outptr, rgba);
|
||||
#elif RGB_PIXELSIZE == 3
|
||||
uint8x8x3_t rgb;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgb.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgb.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgb.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst3_u8(outptr, rgb);
|
||||
#else
|
||||
/* Pack R, G, and B values in ratio 5:6:5. */
|
||||
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
|
||||
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
|
||||
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst1q_u16((uint16_t *)outptr, rgb565);
|
||||
#endif
|
||||
|
||||
/* Increment pointers. */
|
||||
inptr0 += 8;
|
||||
inptr1 += 8;
|
||||
inptr2 += 8;
|
||||
outptr += (RGB_PIXELSIZE * 8);
|
||||
cols_remaining -= 8;
|
||||
}
|
||||
|
||||
/* Handle the tail elements. */
|
||||
if (cols_remaining > 0) {
|
||||
uint8x8_t y = vld1_u8(inptr0);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
|
||||
consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
|
||||
consts, 3);
|
||||
/* Add Y. */
|
||||
int16x8_t r =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
|
||||
int16x8_t b =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
|
||||
int16x8_t g =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
|
||||
|
||||
#if RGB_PIXELSIZE == 4
|
||||
uint8x8x4_t rgba;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgba.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgba.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgba.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 7:
|
||||
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
vst4_lane_u8(outptr, rgba, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#elif RGB_PIXELSIZE == 3
|
||||
uint8x8x3_t rgb;
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
|
||||
rgb.val[RGB_RED] = vqmovun_s16(r);
|
||||
rgb.val[RGB_GREEN] = vqmovun_s16(g);
|
||||
rgb.val[RGB_BLUE] = vqmovun_s16(b);
|
||||
/* Store RGB pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 7:
|
||||
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
vst3_lane_u8(outptr, rgb, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#else
|
||||
/* Pack R, G, and B values in ratio 5:6:5. */
|
||||
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
|
||||
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
|
||||
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
|
||||
/* Store RGB565 pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 7:
|
||||
vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
141
android/extern/libjpeg-turbo/simd/arm/jdcolor-neon.c
vendored
Normal file
141
android/extern/libjpeg-turbo/simd/arm/jdcolor-neon.c
vendored
Normal file
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
* jdcolor-neon.c - colorspace conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* YCbCr -> RGB conversion constants */
|
||||
|
||||
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
|
||||
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
|
||||
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
|
||||
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
|
||||
-F_0_344, F_0_714, F_1_402, F_1_772
|
||||
};
|
||||
|
||||
|
||||
/* Include inline routines for colorspace extensions. */
|
||||
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_RED EXT_RGB_RED
|
||||
#define RGB_GREEN EXT_RGB_GREEN
|
||||
#define RGB_BLUE EXT_RGB_BLUE
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_RGBX_RED
|
||||
#define RGB_GREEN EXT_RGBX_GREEN
|
||||
#define RGB_BLUE EXT_RGBX_BLUE
|
||||
#define RGB_ALPHA 3
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGR_RED
|
||||
#define RGB_GREEN EXT_BGR_GREEN
|
||||
#define RGB_BLUE EXT_BGR_BLUE
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_BGRX_RED
|
||||
#define RGB_GREEN EXT_BGRX_GREEN
|
||||
#define RGB_BLUE EXT_BGRX_BLUE
|
||||
#define RGB_ALPHA 3
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XBGR_RED
|
||||
#define RGB_GREEN EXT_XBGR_GREEN
|
||||
#define RGB_BLUE EXT_XBGR_BLUE
|
||||
#define RGB_ALPHA 0
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
#define RGB_RED EXT_XRGB_RED
|
||||
#define RGB_GREEN EXT_XRGB_GREEN
|
||||
#define RGB_BLUE EXT_XRGB_BLUE
|
||||
#define RGB_ALPHA 0
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
|
||||
/* YCbCr -> RGB565 Conversion */
|
||||
|
||||
#define RGB_PIXELSIZE 2
|
||||
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
|
||||
#include "jdcolext-neon.c"
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_ycc_rgb_convert_neon
|
||||
144
android/extern/libjpeg-turbo/simd/arm/jdmerge-neon.c
vendored
Normal file
144
android/extern/libjpeg-turbo/simd/arm/jdmerge-neon.c
vendored
Normal file
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* YCbCr -> RGB conversion constants */
|
||||
|
||||
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
|
||||
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
|
||||
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
|
||||
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
|
||||
-F_0_344, F_0_714, F_1_402, F_1_772
|
||||
};
|
||||
|
||||
|
||||
/* Include inline routines for colorspace extensions. */
|
||||
|
||||
#include "jdmrgext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
|
||||
#define RGB_RED EXT_RGB_RED
|
||||
#define RGB_GREEN EXT_RGB_GREEN
|
||||
#define RGB_BLUE EXT_RGB_BLUE
|
||||
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
|
||||
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
|
||||
#include "jdmrgext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_h2v1_merged_upsample_neon
|
||||
#undef jsimd_h2v2_merged_upsample_neon
|
||||
|
||||
#define RGB_RED EXT_RGBX_RED
|
||||
#define RGB_GREEN EXT_RGBX_GREEN
|
||||
#define RGB_BLUE EXT_RGBX_BLUE
|
||||
#define RGB_ALPHA 3
|
||||
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
|
||||
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
|
||||
#include "jdmrgext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_h2v1_merged_upsample_neon
|
||||
#undef jsimd_h2v2_merged_upsample_neon
|
||||
|
||||
#define RGB_RED EXT_BGR_RED
|
||||
#define RGB_GREEN EXT_BGR_GREEN
|
||||
#define RGB_BLUE EXT_BGR_BLUE
|
||||
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
|
||||
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
|
||||
#include "jdmrgext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_h2v1_merged_upsample_neon
|
||||
#undef jsimd_h2v2_merged_upsample_neon
|
||||
|
||||
#define RGB_RED EXT_BGRX_RED
|
||||
#define RGB_GREEN EXT_BGRX_GREEN
|
||||
#define RGB_BLUE EXT_BGRX_BLUE
|
||||
#define RGB_ALPHA 3
|
||||
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
|
||||
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
|
||||
#include "jdmrgext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_h2v1_merged_upsample_neon
|
||||
#undef jsimd_h2v2_merged_upsample_neon
|
||||
|
||||
#define RGB_RED EXT_XBGR_RED
|
||||
#define RGB_GREEN EXT_XBGR_GREEN
|
||||
#define RGB_BLUE EXT_XBGR_BLUE
|
||||
#define RGB_ALPHA 0
|
||||
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
|
||||
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
|
||||
#include "jdmrgext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_h2v1_merged_upsample_neon
|
||||
#undef jsimd_h2v2_merged_upsample_neon
|
||||
|
||||
#define RGB_RED EXT_XRGB_RED
|
||||
#define RGB_GREEN EXT_XRGB_GREEN
|
||||
#define RGB_BLUE EXT_XRGB_BLUE
|
||||
#define RGB_ALPHA 0
|
||||
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
|
||||
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
|
||||
#include "jdmrgext-neon.c"
|
||||
#undef RGB_RED
|
||||
#undef RGB_GREEN
|
||||
#undef RGB_BLUE
|
||||
#undef RGB_ALPHA
|
||||
#undef RGB_PIXELSIZE
|
||||
#undef jsimd_h2v1_merged_upsample_neon
|
||||
723
android/extern/libjpeg-turbo/simd/arm/jdmrgext-neon.c
vendored
Normal file
723
android/extern/libjpeg-turbo/simd/arm/jdmrgext-neon.c
vendored
Normal file
@@ -0,0 +1,723 @@
|
||||
/*
|
||||
* jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* This file is included by jdmerge-neon.c. */
|
||||
|
||||
|
||||
/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
|
||||
* chroma upsampling and YCbCr -> RGB color conversion into a single function.
|
||||
*
|
||||
* As with the standalone functions, YCbCr -> RGB conversion is defined by the
|
||||
* following equations:
|
||||
* R = Y + 1.40200 * (Cr - 128)
|
||||
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
|
||||
* B = Y + 1.77200 * (Cb - 128)
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.3441467 = 11277 * 2^-15
|
||||
* 0.7141418 = 23401 * 2^-15
|
||||
* 1.4020386 = 22971 * 2^-14
|
||||
* 1.7720337 = 29033 * 2^-14
|
||||
* These constants are defined in jdmerge-neon.c.
|
||||
*
|
||||
* To ensure correct results, rounding is used when descaling.
|
||||
*/
|
||||
|
||||
/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
|
||||
* routines:
|
||||
*
|
||||
* Input memory buffers can be safely overread up to the next multiple of
|
||||
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
|
||||
* jmemmgr.c.
|
||||
*
|
||||
* The output buffer cannot safely be written beyond output_width, since
|
||||
* output_buf points to a possibly unpadded row in the decompressed image
|
||||
* buffer allocated by the calling program.
|
||||
*/
|
||||
|
||||
/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
|
||||
*/
|
||||
|
||||
void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
|
||||
JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr,
|
||||
JSAMPARRAY output_buf)
|
||||
{
|
||||
JSAMPROW outptr;
|
||||
/* Pointers to Y, Cb, and Cr data */
|
||||
JSAMPROW inptr0, inptr1, inptr2;
|
||||
|
||||
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
|
||||
const int16x8_t neg_128 = vdupq_n_s16(-128);
|
||||
|
||||
inptr0 = input_buf[0][in_row_group_ctr];
|
||||
inptr1 = input_buf[1][in_row_group_ctr];
|
||||
inptr2 = input_buf[2][in_row_group_ctr];
|
||||
outptr = output_buf[0];
|
||||
|
||||
int cols_remaining = output_width;
|
||||
for (; cols_remaining >= 16; cols_remaining -= 16) {
|
||||
/* De-interleave Y component values into two separate vectors, one
|
||||
* containing the component values with even-numbered indices and one
|
||||
* containing the component values with odd-numbered indices.
|
||||
*/
|
||||
uint8x8x2_t y = vld2_u8(inptr0);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
|
||||
* "odd" Y component values. This effectively upsamples the chroma
|
||||
* components horizontally.
|
||||
*/
|
||||
int16x8_t g_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y.val[0]));
|
||||
int16x8_t r_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y.val[0]));
|
||||
int16x8_t b_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y.val[0]));
|
||||
int16x8_t g_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y.val[1]));
|
||||
int16x8_t r_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y.val[1]));
|
||||
int16x8_t b_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y.val[1]));
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||
* Re-interleave the "even" and "odd" component values.
|
||||
*/
|
||||
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
|
||||
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
|
||||
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
|
||||
|
||||
#ifdef RGB_ALPHA
|
||||
uint8x16x4_t rgba;
|
||||
rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
|
||||
rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
|
||||
rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
vst4q_u8(outptr, rgba);
|
||||
#else
|
||||
uint8x16x3_t rgb;
|
||||
rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
|
||||
rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
|
||||
rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst3q_u8(outptr, rgb);
|
||||
#endif
|
||||
|
||||
/* Increment pointers. */
|
||||
inptr0 += 16;
|
||||
inptr1 += 8;
|
||||
inptr2 += 8;
|
||||
outptr += (RGB_PIXELSIZE * 16);
|
||||
}
|
||||
|
||||
if (cols_remaining > 0) {
|
||||
/* De-interleave Y component values into two separate vectors, one
|
||||
* containing the component values with even-numbered indices and one
|
||||
* containing the component values with odd-numbered indices.
|
||||
*/
|
||||
uint8x8x2_t y = vld2_u8(inptr0);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
|
||||
* "odd" Y component values. This effectively upsamples the chroma
|
||||
* components horizontally.
|
||||
*/
|
||||
int16x8_t g_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y.val[0]));
|
||||
int16x8_t r_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y.val[0]));
|
||||
int16x8_t b_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y.val[0]));
|
||||
int16x8_t g_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y.val[1]));
|
||||
int16x8_t r_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y.val[1]));
|
||||
int16x8_t b_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y.val[1]));
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||
* Re-interleave the "even" and "odd" component values.
|
||||
*/
|
||||
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
|
||||
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
|
||||
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
|
||||
|
||||
#ifdef RGB_ALPHA
|
||||
uint8x8x4_t rgba_h;
|
||||
rgba_h.val[RGB_RED] = r.val[1];
|
||||
rgba_h.val[RGB_GREEN] = g.val[1];
|
||||
rgba_h.val[RGB_BLUE] = b.val[1];
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
uint8x8x4_t rgba_l;
|
||||
rgba_l.val[RGB_RED] = r.val[0];
|
||||
rgba_l.val[RGB_GREEN] = g.val[0];
|
||||
rgba_l.val[RGB_BLUE] = b.val[0];
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 15:
|
||||
vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 14:
|
||||
vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 13:
|
||||
vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 12:
|
||||
vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 11:
|
||||
vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 10:
|
||||
vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 9:
|
||||
vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 8:
|
||||
vst4_u8(outptr, rgba_l);
|
||||
break;
|
||||
case 7:
|
||||
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
vst4_lane_u8(outptr, rgba_l, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#else
|
||||
uint8x8x3_t rgb_h;
|
||||
rgb_h.val[RGB_RED] = r.val[1];
|
||||
rgb_h.val[RGB_GREEN] = g.val[1];
|
||||
rgb_h.val[RGB_BLUE] = b.val[1];
|
||||
uint8x8x3_t rgb_l;
|
||||
rgb_l.val[RGB_RED] = r.val[0];
|
||||
rgb_l.val[RGB_GREEN] = g.val[0];
|
||||
rgb_l.val[RGB_BLUE] = b.val[0];
|
||||
/* Store RGB pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 15:
|
||||
vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 14:
|
||||
vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 13:
|
||||
vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 12:
|
||||
vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 11:
|
||||
vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 10:
|
||||
vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 9:
|
||||
vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 8:
|
||||
vst3_u8(outptr, rgb_l);
|
||||
break;
|
||||
case 7:
|
||||
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
vst3_lane_u8(outptr, rgb_l, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
|
||||
*
|
||||
* See comments above for details regarding color conversion and safe memory
|
||||
* access.
|
||||
*/
|
||||
|
||||
void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
|
||||
JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr,
|
||||
JSAMPARRAY output_buf)
|
||||
{
|
||||
JSAMPROW outptr0, outptr1;
|
||||
/* Pointers to Y (both rows), Cb, and Cr data */
|
||||
JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
|
||||
|
||||
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
|
||||
const int16x8_t neg_128 = vdupq_n_s16(-128);
|
||||
|
||||
inptr0_0 = input_buf[0][in_row_group_ctr * 2];
|
||||
inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
|
||||
inptr1 = input_buf[1][in_row_group_ctr];
|
||||
inptr2 = input_buf[2][in_row_group_ctr];
|
||||
outptr0 = output_buf[0];
|
||||
outptr1 = output_buf[1];
|
||||
|
||||
int cols_remaining = output_width;
|
||||
for (; cols_remaining >= 16; cols_remaining -= 16) {
|
||||
/* For each row, de-interleave Y component values into two separate
|
||||
* vectors, one containing the component values with even-numbered indices
|
||||
* and one containing the component values with odd-numbered indices.
|
||||
*/
|
||||
uint8x8x2_t y0 = vld2_u8(inptr0_0);
|
||||
uint8x8x2_t y1 = vld2_u8(inptr0_1);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
|
||||
* the "even" and "odd" Y component values. This effectively upsamples the
|
||||
* chroma components both horizontally and vertically.
|
||||
*/
|
||||
int16x8_t g0_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y0.val[0]));
|
||||
int16x8_t r0_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y0.val[0]));
|
||||
int16x8_t b0_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y0.val[0]));
|
||||
int16x8_t g0_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y0.val[1]));
|
||||
int16x8_t r0_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y0.val[1]));
|
||||
int16x8_t b0_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y0.val[1]));
|
||||
int16x8_t g1_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y1.val[0]));
|
||||
int16x8_t r1_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y1.val[0]));
|
||||
int16x8_t b1_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y1.val[0]));
|
||||
int16x8_t g1_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y1.val[1]));
|
||||
int16x8_t r1_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y1.val[1]));
|
||||
int16x8_t b1_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y1.val[1]));
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||
* Re-interleave the "even" and "odd" component values.
|
||||
*/
|
||||
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
|
||||
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
|
||||
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
|
||||
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
|
||||
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
|
||||
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
|
||||
|
||||
#ifdef RGB_ALPHA
|
||||
uint8x16x4_t rgba0, rgba1;
|
||||
rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
|
||||
rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
|
||||
rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
|
||||
rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
|
||||
rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
|
||||
rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||
rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
vst4q_u8(outptr0, rgba0);
|
||||
vst4q_u8(outptr1, rgba1);
|
||||
#else
|
||||
uint8x16x3_t rgb0, rgb1;
|
||||
rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
|
||||
rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
|
||||
rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
|
||||
rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
|
||||
rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
|
||||
rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
|
||||
/* Store RGB pixel data to memory. */
|
||||
vst3q_u8(outptr0, rgb0);
|
||||
vst3q_u8(outptr1, rgb1);
|
||||
#endif
|
||||
|
||||
/* Increment pointers. */
|
||||
inptr0_0 += 16;
|
||||
inptr0_1 += 16;
|
||||
inptr1 += 8;
|
||||
inptr2 += 8;
|
||||
outptr0 += (RGB_PIXELSIZE * 16);
|
||||
outptr1 += (RGB_PIXELSIZE * 16);
|
||||
}
|
||||
|
||||
if (cols_remaining > 0) {
|
||||
/* For each row, de-interleave Y component values into two separate
|
||||
* vectors, one containing the component values with even-numbered indices
|
||||
* and one containing the component values with odd-numbered indices.
|
||||
*/
|
||||
uint8x8x2_t y0 = vld2_u8(inptr0_0);
|
||||
uint8x8x2_t y1 = vld2_u8(inptr0_1);
|
||||
uint8x8_t cb = vld1_u8(inptr1);
|
||||
uint8x8_t cr = vld1_u8(inptr2);
|
||||
/* Subtract 128 from Cb and Cr. */
|
||||
int16x8_t cr_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
|
||||
int16x8_t cb_128 =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
|
||||
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
|
||||
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
|
||||
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
|
||||
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
|
||||
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
|
||||
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
|
||||
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
|
||||
vrshrn_n_s32(g_sub_y_h, 15));
|
||||
/* Compute R-Y: 1.40200 * (Cr - 128) */
|
||||
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
|
||||
/* Compute B-Y: 1.77200 * (Cb - 128) */
|
||||
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
|
||||
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
|
||||
* the "even" and "odd" Y component values. This effectively upsamples the
|
||||
* chroma components both horizontally and vertically.
|
||||
*/
|
||||
int16x8_t g0_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y0.val[0]));
|
||||
int16x8_t r0_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y0.val[0]));
|
||||
int16x8_t b0_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y0.val[0]));
|
||||
int16x8_t g0_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y0.val[1]));
|
||||
int16x8_t r0_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y0.val[1]));
|
||||
int16x8_t b0_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y0.val[1]));
|
||||
int16x8_t g1_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y1.val[0]));
|
||||
int16x8_t r1_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y1.val[0]));
|
||||
int16x8_t b1_even =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y1.val[0]));
|
||||
int16x8_t g1_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
|
||||
y1.val[1]));
|
||||
int16x8_t r1_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
|
||||
y1.val[1]));
|
||||
int16x8_t b1_odd =
|
||||
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
|
||||
y1.val[1]));
|
||||
/* Convert each component to unsigned and narrow, clamping to [0-255].
|
||||
* Re-interleave the "even" and "odd" component values.
|
||||
*/
|
||||
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
|
||||
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
|
||||
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
|
||||
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
|
||||
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
|
||||
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
|
||||
|
||||
#ifdef RGB_ALPHA
|
||||
uint8x8x4_t rgba0_h, rgba1_h;
|
||||
rgba0_h.val[RGB_RED] = r0.val[1];
|
||||
rgba1_h.val[RGB_RED] = r1.val[1];
|
||||
rgba0_h.val[RGB_GREEN] = g0.val[1];
|
||||
rgba1_h.val[RGB_GREEN] = g1.val[1];
|
||||
rgba0_h.val[RGB_BLUE] = b0.val[1];
|
||||
rgba1_h.val[RGB_BLUE] = b1.val[1];
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
|
||||
uint8x8x4_t rgba0_l, rgba1_l;
|
||||
rgba0_l.val[RGB_RED] = r0.val[0];
|
||||
rgba1_l.val[RGB_RED] = r1.val[0];
|
||||
rgba0_l.val[RGB_GREEN] = g0.val[0];
|
||||
rgba1_l.val[RGB_GREEN] = g1.val[0];
|
||||
rgba0_l.val[RGB_BLUE] = b0.val[0];
|
||||
rgba1_l.val[RGB_BLUE] = b1.val[0];
|
||||
/* Set alpha channel to opaque (0xFF). */
|
||||
rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
|
||||
/* Store RGBA pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 15:
|
||||
vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
|
||||
vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 14:
|
||||
vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
|
||||
vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 13:
|
||||
vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
|
||||
vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 12:
|
||||
vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
|
||||
vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 11:
|
||||
vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
|
||||
vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 10:
|
||||
vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
|
||||
vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 9:
|
||||
vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
|
||||
vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 8:
|
||||
vst4_u8(outptr0, rgba0_l);
|
||||
vst4_u8(outptr1, rgba1_l);
|
||||
break;
|
||||
case 7:
|
||||
vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
|
||||
vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
|
||||
vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
|
||||
vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
|
||||
vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
|
||||
vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
|
||||
vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
vst4_lane_u8(outptr0, rgba0_l, 0);
|
||||
vst4_lane_u8(outptr1, rgba1_l, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#else
|
||||
uint8x8x3_t rgb0_h, rgb1_h;
|
||||
rgb0_h.val[RGB_RED] = r0.val[1];
|
||||
rgb1_h.val[RGB_RED] = r1.val[1];
|
||||
rgb0_h.val[RGB_GREEN] = g0.val[1];
|
||||
rgb1_h.val[RGB_GREEN] = g1.val[1];
|
||||
rgb0_h.val[RGB_BLUE] = b0.val[1];
|
||||
rgb1_h.val[RGB_BLUE] = b1.val[1];
|
||||
|
||||
uint8x8x3_t rgb0_l, rgb1_l;
|
||||
rgb0_l.val[RGB_RED] = r0.val[0];
|
||||
rgb1_l.val[RGB_RED] = r1.val[0];
|
||||
rgb0_l.val[RGB_GREEN] = g0.val[0];
|
||||
rgb1_l.val[RGB_GREEN] = g1.val[0];
|
||||
rgb0_l.val[RGB_BLUE] = b0.val[0];
|
||||
rgb1_l.val[RGB_BLUE] = b1.val[0];
|
||||
/* Store RGB pixel data to memory. */
|
||||
switch (cols_remaining) {
|
||||
case 15:
|
||||
vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
|
||||
vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 14:
|
||||
vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
|
||||
vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 13:
|
||||
vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
|
||||
vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 12:
|
||||
vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
|
||||
vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 11:
|
||||
vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
|
||||
vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 10:
|
||||
vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
|
||||
vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 9:
|
||||
vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
|
||||
vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 8:
|
||||
vst3_u8(outptr0, rgb0_l);
|
||||
vst3_u8(outptr1, rgb1_l);
|
||||
break;
|
||||
case 7:
|
||||
vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
|
||||
vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 6:
|
||||
vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
|
||||
vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 5:
|
||||
vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
|
||||
vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 4:
|
||||
vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
|
||||
vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 3:
|
||||
vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
|
||||
vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 2:
|
||||
vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
|
||||
vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
case 1:
|
||||
vst3_lane_u8(outptr0, rgb0_l, 0);
|
||||
vst3_lane_u8(outptr1, rgb1_l, 0);
|
||||
FALLTHROUGH /*FALLTHROUGH*/
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
569
android/extern/libjpeg-turbo/simd/arm/jdsample-neon.c
vendored
Normal file
569
android/extern/libjpeg-turbo/simd/arm/jdsample-neon.c
vendored
Normal file
@@ -0,0 +1,569 @@
|
||||
/*
|
||||
* jdsample-neon.c - upsampling (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* The diagram below shows a row of samples produced by h2v1 downsampling.
|
||||
*
|
||||
* s0 s1 s2
|
||||
* +---------+---------+---------+
|
||||
* | | | |
|
||||
* | p0 p1 | p2 p3 | p4 p5 |
|
||||
* | | | |
|
||||
* +---------+---------+---------+
|
||||
*
|
||||
* Samples s0-s2 were created by averaging the original pixel component values
|
||||
* centered at positions p0-p5 above. To approximate those original pixel
|
||||
* component values, we proportionally blend the adjacent samples in each row.
|
||||
*
|
||||
* An upsampled pixel component value is computed by blending the sample
|
||||
* containing the pixel center with the nearest neighboring sample, in the
|
||||
* ratio 3:1. For example:
|
||||
* p1(upsampled) = 3/4 * s0 + 1/4 * s1
|
||||
* p2(upsampled) = 3/4 * s1 + 1/4 * s0
|
||||
* When computing the first and last pixel component values in the row, there
|
||||
* is no adjacent sample to blend, so:
|
||||
* p0(upsampled) = s0
|
||||
* p5(upsampled) = s2
|
||||
*/
|
||||
|
||||
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
|
||||
JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr, outptr;
|
||||
int inrow;
|
||||
unsigned colctr;
|
||||
/* Set up constants. */
|
||||
const uint16x8_t one_u16 = vdupq_n_u16(1);
|
||||
const uint8x8_t three_u8 = vdup_n_u8(3);
|
||||
|
||||
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
|
||||
inptr = input_data[inrow];
|
||||
outptr = output_data[inrow];
|
||||
/* First pixel component value in this row of the original image */
|
||||
*outptr = (JSAMPLE)GETJSAMPLE(*inptr);
|
||||
|
||||
/* 3/4 * containing sample + 1/4 * nearest neighboring sample
|
||||
* For p1: containing sample = s0, nearest neighboring sample = s1
|
||||
* For p2: containing sample = s1, nearest neighboring sample = s0
|
||||
*/
|
||||
uint8x16_t s0 = vld1q_u8(inptr);
|
||||
uint8x16_t s1 = vld1q_u8(inptr + 1);
|
||||
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
|
||||
* denote low half and high half respectively.
|
||||
*/
|
||||
uint16x8_t s1_add_3s0_l =
|
||||
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
|
||||
uint16x8_t s1_add_3s0_h =
|
||||
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
|
||||
uint16x8_t s0_add_3s1_l =
|
||||
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
|
||||
uint16x8_t s0_add_3s1_h =
|
||||
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
|
||||
/* Add ordered dithering bias to odd pixel values. */
|
||||
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
|
||||
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
|
||||
|
||||
/* The offset is initially 1, because the first pixel component has already
|
||||
* been stored. However, in subsequent iterations of the SIMD loop, this
|
||||
* offset is (2 * colctr - 1) to stay within the bounds of the sample
|
||||
* buffers without having to resort to a slow scalar tail case for the last
|
||||
* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
|
||||
* in jmemmgr.c for more details.
|
||||
*/
|
||||
unsigned outptr_offset = 1;
|
||||
uint8x16x2_t output_pixels;
|
||||
|
||||
/* We use software pipelining to maximise performance. The code indented
|
||||
* an extra two spaces begins the next iteration of the loop.
|
||||
*/
|
||||
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
|
||||
|
||||
s0 = vld1q_u8(inptr + colctr - 1);
|
||||
s1 = vld1q_u8(inptr + colctr);
|
||||
|
||||
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
|
||||
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
|
||||
vrshrn_n_u16(s1_add_3s0_h, 2));
|
||||
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
|
||||
vshrn_n_u16(s0_add_3s1_h, 2));
|
||||
|
||||
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
|
||||
* denote low half and high half respectively.
|
||||
*/
|
||||
s1_add_3s0_l =
|
||||
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
|
||||
s1_add_3s0_h =
|
||||
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
|
||||
s0_add_3s1_l =
|
||||
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
|
||||
s0_add_3s1_h =
|
||||
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
|
||||
/* Add ordered dithering bias to odd pixel values. */
|
||||
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
|
||||
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
|
||||
|
||||
/* Store pixel component values to memory. */
|
||||
vst2q_u8(outptr + outptr_offset, output_pixels);
|
||||
outptr_offset = 2 * colctr - 1;
|
||||
}
|
||||
|
||||
/* Complete the last iteration of the loop. */
|
||||
|
||||
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
|
||||
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
|
||||
vrshrn_n_u16(s1_add_3s0_h, 2));
|
||||
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
|
||||
vshrn_n_u16(s0_add_3s1_h, 2));
|
||||
/* Store pixel component values to memory. */
|
||||
vst2q_u8(outptr + outptr_offset, output_pixels);
|
||||
|
||||
/* Last pixel component value in this row of the original image */
|
||||
outptr[2 * downsampled_width - 1] =
|
||||
GETJSAMPLE(inptr[downsampled_width - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* The diagram below shows an array of samples produced by h2v2 downsampling.
|
||||
*
|
||||
* s0 s1 s2
|
||||
* +---------+---------+---------+
|
||||
* | p0 p1 | p2 p3 | p4 p5 |
|
||||
* sA | | | |
|
||||
* | p6 p7 | p8 p9 | p10 p11|
|
||||
* +---------+---------+---------+
|
||||
* | p12 p13| p14 p15| p16 p17|
|
||||
* sB | | | |
|
||||
* | p18 p19| p20 p21| p22 p23|
|
||||
* +---------+---------+---------+
|
||||
* | p24 p25| p26 p27| p28 p29|
|
||||
* sC | | | |
|
||||
* | p30 p31| p32 p33| p34 p35|
|
||||
* +---------+---------+---------+
|
||||
*
|
||||
* Samples s0A-s2C were created by averaging the original pixel component
|
||||
* values centered at positions p0-p35 above. To approximate one of those
|
||||
* original pixel component values, we proportionally blend the sample
|
||||
* containing the pixel center with the nearest neighboring samples in each
|
||||
* row, column, and diagonal.
|
||||
*
|
||||
* An upsampled pixel component value is computed by first blending the sample
|
||||
* containing the pixel center with the nearest neighboring samples in the
|
||||
* same column, in the ratio 3:1, and then blending each column sum with the
|
||||
* nearest neighboring column sum, in the ratio 3:1. For example:
|
||||
* p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
|
||||
* 1/4 * (3/4 * s0B + 1/4 * s0A)
|
||||
* = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
|
||||
* When computing the first and last pixel component values in the row, there
|
||||
* is no horizontally adjacent sample to blend, so:
|
||||
* p12(upsampled) = 3/4 * s0B + 1/4 * s0A
|
||||
* p23(upsampled) = 3/4 * s2B + 1/4 * s2C
|
||||
* When computing the first and last pixel component values in the column,
|
||||
* there is no vertically adjacent sample to blend, so:
|
||||
* p2(upsampled) = 3/4 * s1A + 1/4 * s0A
|
||||
* p33(upsampled) = 3/4 * s1C + 1/4 * s2C
|
||||
* When computing the corner pixel component values, there is no adjacent
|
||||
* sample to blend, so:
|
||||
* p0(upsampled) = s0A
|
||||
* p35(upsampled) = s2C
|
||||
*/
|
||||
|
||||
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
|
||||
JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
|
||||
int inrow, outrow;
|
||||
unsigned colctr;
|
||||
/* Set up constants. */
|
||||
const uint16x8_t seven_u16 = vdupq_n_u16(7);
|
||||
const uint8x8_t three_u8 = vdup_n_u8(3);
|
||||
const uint16x8_t three_u16 = vdupq_n_u16(3);
|
||||
|
||||
inrow = outrow = 0;
|
||||
while (outrow < max_v_samp_factor) {
|
||||
inptr0 = input_data[inrow - 1];
|
||||
inptr1 = input_data[inrow];
|
||||
inptr2 = input_data[inrow + 1];
|
||||
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
|
||||
* respectively.
|
||||
*/
|
||||
outptr0 = output_data[outrow++];
|
||||
outptr1 = output_data[outrow++];
|
||||
|
||||
/* First pixel component value in this row of the original image */
|
||||
int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
|
||||
*outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
|
||||
int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
|
||||
*outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
|
||||
|
||||
/* Step 1: Blend samples vertically in columns s0 and s1.
|
||||
* Leave the divide by 4 until the end, when it can be done for both
|
||||
* dimensions at once, right-shifting by 4.
|
||||
*/
|
||||
|
||||
/* Load and compute s0colsum0 and s0colsum1. */
|
||||
uint8x16_t s0A = vld1q_u8(inptr0);
|
||||
uint8x16_t s0B = vld1q_u8(inptr1);
|
||||
uint8x16_t s0C = vld1q_u8(inptr2);
|
||||
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
|
||||
* denote low half and high half respectively.
|
||||
*/
|
||||
uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
|
||||
vget_low_u8(s0B), three_u8);
|
||||
uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
|
||||
vget_high_u8(s0B), three_u8);
|
||||
uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
|
||||
vget_low_u8(s0B), three_u8);
|
||||
uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
|
||||
vget_high_u8(s0B), three_u8);
|
||||
/* Load and compute s1colsum0 and s1colsum1. */
|
||||
uint8x16_t s1A = vld1q_u8(inptr0 + 1);
|
||||
uint8x16_t s1B = vld1q_u8(inptr1 + 1);
|
||||
uint8x16_t s1C = vld1q_u8(inptr2 + 1);
|
||||
uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
|
||||
vget_low_u8(s1B), three_u8);
|
||||
uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
|
||||
vget_high_u8(s1B), three_u8);
|
||||
uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
|
||||
vget_low_u8(s1B), three_u8);
|
||||
uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
|
||||
vget_high_u8(s1B), three_u8);
|
||||
|
||||
/* Step 2: Blend the already-blended columns. */
|
||||
|
||||
uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
|
||||
uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
|
||||
uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
|
||||
uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
|
||||
uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
|
||||
uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
|
||||
uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
|
||||
uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
|
||||
/* Add ordered dithering bias to odd pixel values. */
|
||||
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
|
||||
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
|
||||
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
|
||||
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
|
||||
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
|
||||
uint8x16x2_t output_pixels0 = { {
|
||||
vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
|
||||
vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
|
||||
} };
|
||||
uint8x16x2_t output_pixels1 = { {
|
||||
vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
|
||||
vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
|
||||
} };
|
||||
|
||||
/* Store pixel component values to memory.
|
||||
* The minimum size of the output buffer for each row is 64 bytes => no
|
||||
* need to worry about buffer overflow here. See "Creation of 2-D sample
|
||||
* arrays" in jmemmgr.c for more details.
|
||||
*/
|
||||
vst2q_u8(outptr0 + 1, output_pixels0);
|
||||
vst2q_u8(outptr1 + 1, output_pixels1);
|
||||
|
||||
/* The first pixel of the image shifted our loads and stores by one byte.
|
||||
* We have to re-align on a 32-byte boundary at some point before the end
|
||||
* of the row (we do it now on the 32/33 pixel boundary) to stay within the
|
||||
* bounds of the sample buffers without having to resort to a slow scalar
|
||||
* tail case for the last (downsampled_width % 16) samples. See "Creation
|
||||
* of 2-D sample arrays" in jmemmgr.c for more details.
|
||||
*/
|
||||
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
|
||||
/* Step 1: Blend samples vertically in columns s0 and s1. */
|
||||
|
||||
/* Load and compute s0colsum0 and s0colsum1. */
|
||||
s0A = vld1q_u8(inptr0 + colctr - 1);
|
||||
s0B = vld1q_u8(inptr1 + colctr - 1);
|
||||
s0C = vld1q_u8(inptr2 + colctr - 1);
|
||||
s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
|
||||
three_u8);
|
||||
s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
|
||||
three_u8);
|
||||
s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
|
||||
three_u8);
|
||||
s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
|
||||
three_u8);
|
||||
/* Load and compute s1colsum0 and s1colsum1. */
|
||||
s1A = vld1q_u8(inptr0 + colctr);
|
||||
s1B = vld1q_u8(inptr1 + colctr);
|
||||
s1C = vld1q_u8(inptr2 + colctr);
|
||||
s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
|
||||
three_u8);
|
||||
s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
|
||||
three_u8);
|
||||
s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
|
||||
three_u8);
|
||||
s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
|
||||
three_u8);
|
||||
|
||||
/* Step 2: Blend the already-blended columns. */
|
||||
|
||||
output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
|
||||
output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
|
||||
output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
|
||||
output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
|
||||
output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
|
||||
output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
|
||||
output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
|
||||
output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
|
||||
/* Add ordered dithering bias to odd pixel values. */
|
||||
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
|
||||
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
|
||||
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
|
||||
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
|
||||
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
|
||||
output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
|
||||
vshrn_n_u16(output0_p1_h, 4));
|
||||
output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
|
||||
vrshrn_n_u16(output0_p2_h, 4));
|
||||
output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
|
||||
vshrn_n_u16(output1_p1_h, 4));
|
||||
output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
|
||||
vrshrn_n_u16(output1_p2_h, 4));
|
||||
/* Store pixel component values to memory. */
|
||||
vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
|
||||
vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
|
||||
}
|
||||
|
||||
/* Last pixel component value in this row of the original image */
|
||||
int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
|
||||
GETJSAMPLE(inptr0[downsampled_width - 1]);
|
||||
outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
|
||||
int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
|
||||
GETJSAMPLE(inptr2[downsampled_width - 1]);
|
||||
outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
|
||||
inrow++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* The diagram below shows a column of samples produced by h1v2 downsampling
|
||||
* (or by losslessly rotating or transposing an h2v1-downsampled image.)
|
||||
*
|
||||
* +---------+
|
||||
* | p0 |
|
||||
* sA | |
|
||||
* | p1 |
|
||||
* +---------+
|
||||
* | p2 |
|
||||
* sB | |
|
||||
* | p3 |
|
||||
* +---------+
|
||||
* | p4 |
|
||||
* sC | |
|
||||
* | p5 |
|
||||
* +---------+
|
||||
*
|
||||
* Samples sA-sC were created by averaging the original pixel component values
|
||||
* centered at positions p0-p5 above. To approximate those original pixel
|
||||
* component values, we proportionally blend the adjacent samples in each
|
||||
* column.
|
||||
*
|
||||
* An upsampled pixel component value is computed by blending the sample
|
||||
* containing the pixel center with the nearest neighboring sample, in the
|
||||
* ratio 3:1. For example:
|
||||
* p1(upsampled) = 3/4 * sA + 1/4 * sB
|
||||
* p2(upsampled) = 3/4 * sB + 1/4 * sA
|
||||
* When computing the first and last pixel component values in the column,
|
||||
* there is no adjacent sample to blend, so:
|
||||
* p0(upsampled) = sA
|
||||
* p5(upsampled) = sC
|
||||
*/
|
||||
|
||||
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
|
||||
JDIMENSION downsampled_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
|
||||
int inrow, outrow;
|
||||
unsigned colctr;
|
||||
/* Set up constants. */
|
||||
const uint16x8_t one_u16 = vdupq_n_u16(1);
|
||||
const uint8x8_t three_u8 = vdup_n_u8(3);
|
||||
|
||||
inrow = outrow = 0;
|
||||
while (outrow < max_v_samp_factor) {
|
||||
inptr0 = input_data[inrow - 1];
|
||||
inptr1 = input_data[inrow];
|
||||
inptr2 = input_data[inrow + 1];
|
||||
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
|
||||
* respectively.
|
||||
*/
|
||||
outptr0 = output_data[outrow++];
|
||||
outptr1 = output_data[outrow++];
|
||||
inrow++;
|
||||
|
||||
/* The size of the input and output buffers is always a multiple of 32
|
||||
* bytes => no need to worry about buffer overflow when reading/writing
|
||||
* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
|
||||
* details.
|
||||
*/
|
||||
for (colctr = 0; colctr < downsampled_width; colctr += 16) {
|
||||
/* Load samples. */
|
||||
uint8x16_t sA = vld1q_u8(inptr0 + colctr);
|
||||
uint8x16_t sB = vld1q_u8(inptr1 + colctr);
|
||||
uint8x16_t sC = vld1q_u8(inptr2 + colctr);
|
||||
/* Blend samples vertically. */
|
||||
uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
|
||||
vget_low_u8(sB), three_u8);
|
||||
uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
|
||||
vget_high_u8(sB), three_u8);
|
||||
uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
|
||||
vget_low_u8(sB), three_u8);
|
||||
uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
|
||||
vget_high_u8(sB), three_u8);
|
||||
/* Add ordered dithering bias to pixel values in even output rows. */
|
||||
colsum0_l = vaddq_u16(colsum0_l, one_u16);
|
||||
colsum0_h = vaddq_u16(colsum0_h, one_u16);
|
||||
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
|
||||
uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
|
||||
vshrn_n_u16(colsum0_h, 2));
|
||||
uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
|
||||
vrshrn_n_u16(colsum1_h, 2));
|
||||
/* Store pixel component values to memory. */
|
||||
vst1q_u8(outptr0 + colctr, output_pixels0);
|
||||
vst1q_u8(outptr1 + colctr, output_pixels1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* The diagram below shows a row of samples produced by h2v1 downsampling.
|
||||
*
|
||||
* s0 s1
|
||||
* +---------+---------+
|
||||
* | | |
|
||||
* | p0 p1 | p2 p3 |
|
||||
* | | |
|
||||
* +---------+---------+
|
||||
*
|
||||
* Samples s0 and s1 were created by averaging the original pixel component
|
||||
* values centered at positions p0-p3 above. To approximate those original
|
||||
* pixel component values, we duplicate the samples horizontally:
|
||||
* p0(upsampled) = p1(upsampled) = s0
|
||||
* p2(upsampled) = p3(upsampled) = s1
|
||||
*/
|
||||
|
||||
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr, outptr;
|
||||
int inrow;
|
||||
unsigned colctr;
|
||||
|
||||
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
|
||||
inptr = input_data[inrow];
|
||||
outptr = output_data[inrow];
|
||||
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
||||
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
||||
/* Duplicate the samples. The store operation below interleaves them so
|
||||
* that adjacent pixel component values take on the same sample value,
|
||||
* per above.
|
||||
*/
|
||||
uint8x16x2_t output_pixels = { { samples, samples } };
|
||||
/* Store pixel component values to memory.
|
||||
* Due to the way sample buffers are allocated, we don't need to worry
|
||||
* about tail cases when output_width is not a multiple of 32. See
|
||||
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
||||
*/
|
||||
vst2q_u8(outptr + 2 * colctr, output_pixels);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* The diagram below shows an array of samples produced by h2v2 downsampling.
|
||||
*
|
||||
* s0 s1
|
||||
* +---------+---------+
|
||||
* | p0 p1 | p2 p3 |
|
||||
* sA | | |
|
||||
* | p4 p5 | p6 p7 |
|
||||
* +---------+---------+
|
||||
* | p8 p9 | p10 p11|
|
||||
* sB | | |
|
||||
* | p12 p13| p14 p15|
|
||||
* +---------+---------+
|
||||
*
|
||||
* Samples s0A-s1B were created by averaging the original pixel component
|
||||
* values centered at positions p0-p15 above. To approximate those original
|
||||
* pixel component values, we duplicate the samples both horizontally and
|
||||
* vertically:
|
||||
* p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
|
||||
* p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
|
||||
* p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
|
||||
* p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
|
||||
*/
|
||||
|
||||
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY *output_data_ptr)
|
||||
{
|
||||
JSAMPARRAY output_data = *output_data_ptr;
|
||||
JSAMPROW inptr, outptr0, outptr1;
|
||||
int inrow, outrow;
|
||||
unsigned colctr;
|
||||
|
||||
for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
|
||||
inptr = input_data[inrow];
|
||||
outptr0 = output_data[outrow++];
|
||||
outptr1 = output_data[outrow++];
|
||||
|
||||
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
|
||||
uint8x16_t samples = vld1q_u8(inptr + colctr);
|
||||
/* Duplicate the samples. The store operation below interleaves them so
|
||||
* that adjacent pixel component values take on the same sample value,
|
||||
* per above.
|
||||
*/
|
||||
uint8x16x2_t output_pixels = { { samples, samples } };
|
||||
/* Store pixel component values for both output rows to memory.
|
||||
* Due to the way sample buffers are allocated, we don't need to worry
|
||||
* about tail cases when output_width is not a multiple of 32. See
|
||||
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
|
||||
*/
|
||||
vst2q_u8(outptr0 + 2 * colctr, output_pixels);
|
||||
vst2q_u8(outptr1 + 2 * colctr, output_pixels);
|
||||
}
|
||||
}
|
||||
}
|
||||
214
android/extern/libjpeg-turbo/simd/arm/jfdctfst-neon.c
vendored
Normal file
214
android/extern/libjpeg-turbo/simd/arm/jfdctfst-neon.c
vendored
Normal file
@@ -0,0 +1,214 @@
|
||||
/*
|
||||
* jfdctfst-neon.c - fast integer FDCT (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
|
||||
* (Discrete Cosine Transform) on one block of samples. It uses the same
|
||||
* calculations and produces exactly the same output as IJG's original
|
||||
* jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.382683433 = 12544 * 2^-15
|
||||
* 0.541196100 = 17795 * 2^-15
|
||||
* 0.707106781 = 23168 * 2^-15
|
||||
* 0.306562965 = 9984 * 2^-15
|
||||
*
|
||||
* See jfdctfst.c for further details of the DCT algorithm. Where possible,
|
||||
* the variable names and comments here in jsimd_fdct_ifast_neon() match up
|
||||
* with those in jpeg_fdct_ifast().
|
||||
*/
|
||||
|
||||
#define F_0_382 12544
|
||||
#define F_0_541 17792
|
||||
#define F_0_707 23168
|
||||
#define F_0_306 9984
|
||||
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
|
||||
F_0_382, F_0_541, F_0_707, F_0_306
|
||||
};
|
||||
|
||||
void jsimd_fdct_ifast_neon(DCTELEM *data)
|
||||
{
|
||||
/* Load an 8x8 block of samples into Neon registers. De-interleaving loads
|
||||
* are used, followed by vuzp to transpose the block such that we have a
|
||||
* column of samples per vector - allowing all rows to be processed at once.
|
||||
*/
|
||||
int16x8x4_t data1 = vld4q_s16(data);
|
||||
int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
|
||||
|
||||
int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
|
||||
int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
|
||||
int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
|
||||
int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
|
||||
|
||||
int16x8_t col0 = cols_04.val[0];
|
||||
int16x8_t col1 = cols_15.val[0];
|
||||
int16x8_t col2 = cols_26.val[0];
|
||||
int16x8_t col3 = cols_37.val[0];
|
||||
int16x8_t col4 = cols_04.val[1];
|
||||
int16x8_t col5 = cols_15.val[1];
|
||||
int16x8_t col6 = cols_26.val[1];
|
||||
int16x8_t col7 = cols_37.val[1];
|
||||
|
||||
/* Pass 1: process rows. */
|
||||
|
||||
/* Load DCT conversion constants. */
|
||||
const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
|
||||
|
||||
int16x8_t tmp0 = vaddq_s16(col0, col7);
|
||||
int16x8_t tmp7 = vsubq_s16(col0, col7);
|
||||
int16x8_t tmp1 = vaddq_s16(col1, col6);
|
||||
int16x8_t tmp6 = vsubq_s16(col1, col6);
|
||||
int16x8_t tmp2 = vaddq_s16(col2, col5);
|
||||
int16x8_t tmp5 = vsubq_s16(col2, col5);
|
||||
int16x8_t tmp3 = vaddq_s16(col3, col4);
|
||||
int16x8_t tmp4 = vsubq_s16(col3, col4);
|
||||
|
||||
/* Even part */
|
||||
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
|
||||
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
|
||||
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
|
||||
int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
|
||||
|
||||
col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
|
||||
col4 = vsubq_s16(tmp10, tmp11);
|
||||
|
||||
int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
|
||||
col2 = vaddq_s16(tmp13, z1); /* phase 5 */
|
||||
col6 = vsubq_s16(tmp13, z1);
|
||||
|
||||
/* Odd part */
|
||||
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
|
||||
tmp11 = vaddq_s16(tmp5, tmp6);
|
||||
tmp12 = vaddq_s16(tmp6, tmp7);
|
||||
|
||||
int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
|
||||
int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
|
||||
z2 = vaddq_s16(z2, z5);
|
||||
int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
|
||||
z5 = vaddq_s16(tmp12, z5);
|
||||
z4 = vaddq_s16(z4, z5);
|
||||
int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
|
||||
|
||||
int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
|
||||
int16x8_t z13 = vsubq_s16(tmp7, z3);
|
||||
|
||||
col5 = vaddq_s16(z13, z2); /* phase 6 */
|
||||
col3 = vsubq_s16(z13, z2);
|
||||
col1 = vaddq_s16(z11, z4);
|
||||
col7 = vsubq_s16(z11, z4);
|
||||
|
||||
/* Transpose to work on columns in pass 2. */
|
||||
int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
|
||||
int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
|
||||
int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
|
||||
int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
|
||||
|
||||
int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
|
||||
vreinterpretq_s32_s16(cols_45.val[0]));
|
||||
int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
|
||||
vreinterpretq_s32_s16(cols_45.val[1]));
|
||||
int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
|
||||
vreinterpretq_s32_s16(cols_67.val[0]));
|
||||
int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
|
||||
vreinterpretq_s32_s16(cols_67.val[1]));
|
||||
|
||||
int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
|
||||
int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
|
||||
int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
|
||||
int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
|
||||
|
||||
int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
|
||||
int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
|
||||
int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
|
||||
int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
|
||||
int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
|
||||
int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
|
||||
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
|
||||
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
|
||||
|
||||
/* Pass 2: process columns. */
|
||||
|
||||
tmp0 = vaddq_s16(row0, row7);
|
||||
tmp7 = vsubq_s16(row0, row7);
|
||||
tmp1 = vaddq_s16(row1, row6);
|
||||
tmp6 = vsubq_s16(row1, row6);
|
||||
tmp2 = vaddq_s16(row2, row5);
|
||||
tmp5 = vsubq_s16(row2, row5);
|
||||
tmp3 = vaddq_s16(row3, row4);
|
||||
tmp4 = vsubq_s16(row3, row4);
|
||||
|
||||
/* Even part */
|
||||
tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
|
||||
tmp13 = vsubq_s16(tmp0, tmp3);
|
||||
tmp11 = vaddq_s16(tmp1, tmp2);
|
||||
tmp12 = vsubq_s16(tmp1, tmp2);
|
||||
|
||||
row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
|
||||
row4 = vsubq_s16(tmp10, tmp11);
|
||||
|
||||
z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
|
||||
row2 = vaddq_s16(tmp13, z1); /* phase 5 */
|
||||
row6 = vsubq_s16(tmp13, z1);
|
||||
|
||||
/* Odd part */
|
||||
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
|
||||
tmp11 = vaddq_s16(tmp5, tmp6);
|
||||
tmp12 = vaddq_s16(tmp6, tmp7);
|
||||
|
||||
z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
|
||||
z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
|
||||
z2 = vaddq_s16(z2, z5);
|
||||
z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
|
||||
z5 = vaddq_s16(tmp12, z5);
|
||||
z4 = vaddq_s16(z4, z5);
|
||||
z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
|
||||
|
||||
z11 = vaddq_s16(tmp7, z3); /* phase 5 */
|
||||
z13 = vsubq_s16(tmp7, z3);
|
||||
|
||||
row5 = vaddq_s16(z13, z2); /* phase 6 */
|
||||
row3 = vsubq_s16(z13, z2);
|
||||
row1 = vaddq_s16(z11, z4);
|
||||
row7 = vsubq_s16(z11, z4);
|
||||
|
||||
vst1q_s16(data + 0 * DCTSIZE, row0);
|
||||
vst1q_s16(data + 1 * DCTSIZE, row1);
|
||||
vst1q_s16(data + 2 * DCTSIZE, row2);
|
||||
vst1q_s16(data + 3 * DCTSIZE, row3);
|
||||
vst1q_s16(data + 4 * DCTSIZE, row4);
|
||||
vst1q_s16(data + 5 * DCTSIZE, row5);
|
||||
vst1q_s16(data + 6 * DCTSIZE, row6);
|
||||
vst1q_s16(data + 7 * DCTSIZE, row7);
|
||||
}
|
||||
376
android/extern/libjpeg-turbo/simd/arm/jfdctint-neon.c
vendored
Normal file
376
android/extern/libjpeg-turbo/simd/arm/jfdctint-neon.c
vendored
Normal file
@@ -0,0 +1,376 @@
|
||||
/*
|
||||
* jfdctint-neon.c - accurate integer FDCT (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
#include "neon-compat.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
|
||||
* (Discrete Cosine Transform) on one block of samples. It uses the same
|
||||
* calculations and produces exactly the same output as IJG's original
|
||||
* jpeg_fdct_islow() function, which can be found in jfdctint.c.
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.298631336 = 2446 * 2^-13
|
||||
* 0.390180644 = 3196 * 2^-13
|
||||
* 0.541196100 = 4433 * 2^-13
|
||||
* 0.765366865 = 6270 * 2^-13
|
||||
* 0.899976223 = 7373 * 2^-13
|
||||
* 1.175875602 = 9633 * 2^-13
|
||||
* 1.501321110 = 12299 * 2^-13
|
||||
* 1.847759065 = 15137 * 2^-13
|
||||
* 1.961570560 = 16069 * 2^-13
|
||||
* 2.053119869 = 16819 * 2^-13
|
||||
* 2.562915447 = 20995 * 2^-13
|
||||
* 3.072711026 = 25172 * 2^-13
|
||||
*
|
||||
* See jfdctint.c for further details of the DCT algorithm. Where possible,
|
||||
* the variable names and comments here in jsimd_fdct_islow_neon() match up
|
||||
* with those in jpeg_fdct_islow().
|
||||
*/
|
||||
|
||||
#define CONST_BITS 13
|
||||
#define PASS1_BITS 2
|
||||
|
||||
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
|
||||
|
||||
#define F_0_298 2446
|
||||
#define F_0_390 3196
|
||||
#define F_0_541 4433
|
||||
#define F_0_765 6270
|
||||
#define F_0_899 7373
|
||||
#define F_1_175 9633
|
||||
#define F_1_501 12299
|
||||
#define F_1_847 15137
|
||||
#define F_1_961 16069
|
||||
#define F_2_053 16819
|
||||
#define F_2_562 20995
|
||||
#define F_3_072 25172
|
||||
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
|
||||
F_0_298, -F_0_390, F_0_541, F_0_765,
|
||||
-F_0_899, F_1_175, F_1_501, -F_1_847,
|
||||
-F_1_961, F_2_053, -F_2_562, F_3_072
|
||||
};
|
||||
|
||||
void jsimd_fdct_islow_neon(DCTELEM *data)
|
||||
{
|
||||
/* Load DCT constants. */
|
||||
#ifdef HAVE_VLD1_S16_X3
|
||||
const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
|
||||
#else
|
||||
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
|
||||
const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
|
||||
const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
|
||||
const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
|
||||
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
|
||||
#endif
|
||||
|
||||
/* Load an 8x8 block of samples into Neon registers. De-interleaving loads
|
||||
* are used, followed by vuzp to transpose the block such that we have a
|
||||
* column of samples per vector - allowing all rows to be processed at once.
|
||||
*/
|
||||
int16x8x4_t s_rows_0123 = vld4q_s16(data);
|
||||
int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
|
||||
|
||||
int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
|
||||
int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
|
||||
int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
|
||||
int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
|
||||
|
||||
int16x8_t col0 = cols_04.val[0];
|
||||
int16x8_t col1 = cols_15.val[0];
|
||||
int16x8_t col2 = cols_26.val[0];
|
||||
int16x8_t col3 = cols_37.val[0];
|
||||
int16x8_t col4 = cols_04.val[1];
|
||||
int16x8_t col5 = cols_15.val[1];
|
||||
int16x8_t col6 = cols_26.val[1];
|
||||
int16x8_t col7 = cols_37.val[1];
|
||||
|
||||
/* Pass 1: process rows. */
|
||||
|
||||
int16x8_t tmp0 = vaddq_s16(col0, col7);
|
||||
int16x8_t tmp7 = vsubq_s16(col0, col7);
|
||||
int16x8_t tmp1 = vaddq_s16(col1, col6);
|
||||
int16x8_t tmp6 = vsubq_s16(col1, col6);
|
||||
int16x8_t tmp2 = vaddq_s16(col2, col5);
|
||||
int16x8_t tmp5 = vsubq_s16(col2, col5);
|
||||
int16x8_t tmp3 = vaddq_s16(col3, col4);
|
||||
int16x8_t tmp4 = vsubq_s16(col3, col4);
|
||||
|
||||
/* Even part */
|
||||
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
|
||||
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
|
||||
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
|
||||
int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
|
||||
|
||||
col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
|
||||
col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
|
||||
|
||||
int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
|
||||
int32x4_t z1_l =
|
||||
vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
|
||||
int32x4_t z1_h =
|
||||
vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
|
||||
|
||||
int32x4_t col2_scaled_l =
|
||||
vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
|
||||
int32x4_t col2_scaled_h =
|
||||
vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
|
||||
col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
|
||||
vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
|
||||
|
||||
int32x4_t col6_scaled_l =
|
||||
vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
|
||||
int32x4_t col6_scaled_h =
|
||||
vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
|
||||
col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
|
||||
vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
|
||||
|
||||
/* Odd part */
|
||||
int16x8_t z1 = vaddq_s16(tmp4, tmp7);
|
||||
int16x8_t z2 = vaddq_s16(tmp5, tmp6);
|
||||
int16x8_t z3 = vaddq_s16(tmp4, tmp6);
|
||||
int16x8_t z4 = vaddq_s16(tmp5, tmp7);
|
||||
/* sqrt(2) * c3 */
|
||||
int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
|
||||
int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
|
||||
z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
|
||||
z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
|
||||
|
||||
/* sqrt(2) * (-c1+c3+c5-c7) */
|
||||
int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
|
||||
int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
|
||||
/* sqrt(2) * ( c1+c3-c5+c7) */
|
||||
int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
|
||||
int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
|
||||
/* sqrt(2) * ( c1+c3+c5-c7) */
|
||||
int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
|
||||
int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
|
||||
/* sqrt(2) * ( c1+c3-c5-c7) */
|
||||
int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
|
||||
int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
|
||||
|
||||
/* sqrt(2) * (c7-c3) */
|
||||
z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
|
||||
z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
|
||||
/* sqrt(2) * (-c1-c3) */
|
||||
int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
|
||||
int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
|
||||
/* sqrt(2) * (-c3-c5) */
|
||||
int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
|
||||
int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
|
||||
/* sqrt(2) * (c5-c3) */
|
||||
int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
|
||||
int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
|
||||
|
||||
z3_l = vaddq_s32(z3_l, z5_l);
|
||||
z3_h = vaddq_s32(z3_h, z5_h);
|
||||
z4_l = vaddq_s32(z4_l, z5_l);
|
||||
z4_h = vaddq_s32(z4_h, z5_h);
|
||||
|
||||
tmp4_l = vaddq_s32(tmp4_l, z1_l);
|
||||
tmp4_h = vaddq_s32(tmp4_h, z1_h);
|
||||
tmp4_l = vaddq_s32(tmp4_l, z3_l);
|
||||
tmp4_h = vaddq_s32(tmp4_h, z3_h);
|
||||
col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
|
||||
vrshrn_n_s32(tmp4_h, DESCALE_P1));
|
||||
|
||||
tmp5_l = vaddq_s32(tmp5_l, z2_l);
|
||||
tmp5_h = vaddq_s32(tmp5_h, z2_h);
|
||||
tmp5_l = vaddq_s32(tmp5_l, z4_l);
|
||||
tmp5_h = vaddq_s32(tmp5_h, z4_h);
|
||||
col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
|
||||
vrshrn_n_s32(tmp5_h, DESCALE_P1));
|
||||
|
||||
tmp6_l = vaddq_s32(tmp6_l, z2_l);
|
||||
tmp6_h = vaddq_s32(tmp6_h, z2_h);
|
||||
tmp6_l = vaddq_s32(tmp6_l, z3_l);
|
||||
tmp6_h = vaddq_s32(tmp6_h, z3_h);
|
||||
col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
|
||||
vrshrn_n_s32(tmp6_h, DESCALE_P1));
|
||||
|
||||
tmp7_l = vaddq_s32(tmp7_l, z1_l);
|
||||
tmp7_h = vaddq_s32(tmp7_h, z1_h);
|
||||
tmp7_l = vaddq_s32(tmp7_l, z4_l);
|
||||
tmp7_h = vaddq_s32(tmp7_h, z4_h);
|
||||
col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
|
||||
vrshrn_n_s32(tmp7_h, DESCALE_P1));
|
||||
|
||||
/* Transpose to work on columns in pass 2. */
|
||||
int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
|
||||
int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
|
||||
int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
|
||||
int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
|
||||
|
||||
int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
|
||||
vreinterpretq_s32_s16(cols_45.val[0]));
|
||||
int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
|
||||
vreinterpretq_s32_s16(cols_45.val[1]));
|
||||
int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
|
||||
vreinterpretq_s32_s16(cols_67.val[0]));
|
||||
int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
|
||||
vreinterpretq_s32_s16(cols_67.val[1]));
|
||||
|
||||
int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
|
||||
int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
|
||||
int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
|
||||
int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
|
||||
|
||||
int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
|
||||
int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
|
||||
int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
|
||||
int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
|
||||
int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
|
||||
int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
|
||||
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
|
||||
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
|
||||
|
||||
/* Pass 2: process columns. */
|
||||
|
||||
tmp0 = vaddq_s16(row0, row7);
|
||||
tmp7 = vsubq_s16(row0, row7);
|
||||
tmp1 = vaddq_s16(row1, row6);
|
||||
tmp6 = vsubq_s16(row1, row6);
|
||||
tmp2 = vaddq_s16(row2, row5);
|
||||
tmp5 = vsubq_s16(row2, row5);
|
||||
tmp3 = vaddq_s16(row3, row4);
|
||||
tmp4 = vsubq_s16(row3, row4);
|
||||
|
||||
/* Even part */
|
||||
tmp10 = vaddq_s16(tmp0, tmp3);
|
||||
tmp13 = vsubq_s16(tmp0, tmp3);
|
||||
tmp11 = vaddq_s16(tmp1, tmp2);
|
||||
tmp12 = vsubq_s16(tmp1, tmp2);
|
||||
|
||||
row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
|
||||
row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
|
||||
|
||||
tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
|
||||
z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
|
||||
z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
|
||||
|
||||
int32x4_t row2_scaled_l =
|
||||
vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
|
||||
int32x4_t row2_scaled_h =
|
||||
vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
|
||||
row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
|
||||
vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
|
||||
|
||||
int32x4_t row6_scaled_l =
|
||||
vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
|
||||
int32x4_t row6_scaled_h =
|
||||
vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
|
||||
row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
|
||||
vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
|
||||
|
||||
/* Odd part */
|
||||
z1 = vaddq_s16(tmp4, tmp7);
|
||||
z2 = vaddq_s16(tmp5, tmp6);
|
||||
z3 = vaddq_s16(tmp4, tmp6);
|
||||
z4 = vaddq_s16(tmp5, tmp7);
|
||||
/* sqrt(2) * c3 */
|
||||
z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
|
||||
z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
|
||||
z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
|
||||
z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
|
||||
|
||||
/* sqrt(2) * (-c1+c3+c5-c7) */
|
||||
tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
|
||||
tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
|
||||
/* sqrt(2) * ( c1+c3-c5+c7) */
|
||||
tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
|
||||
tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
|
||||
/* sqrt(2) * ( c1+c3+c5-c7) */
|
||||
tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
|
||||
tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
|
||||
/* sqrt(2) * ( c1+c3-c5-c7) */
|
||||
tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
|
||||
tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
|
||||
|
||||
/* sqrt(2) * (c7-c3) */
|
||||
z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
|
||||
z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
|
||||
/* sqrt(2) * (-c1-c3) */
|
||||
z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
|
||||
z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
|
||||
/* sqrt(2) * (-c3-c5) */
|
||||
z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
|
||||
z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
|
||||
/* sqrt(2) * (c5-c3) */
|
||||
z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
|
||||
z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
|
||||
|
||||
z3_l = vaddq_s32(z3_l, z5_l);
|
||||
z3_h = vaddq_s32(z3_h, z5_h);
|
||||
z4_l = vaddq_s32(z4_l, z5_l);
|
||||
z4_h = vaddq_s32(z4_h, z5_h);
|
||||
|
||||
tmp4_l = vaddq_s32(tmp4_l, z1_l);
|
||||
tmp4_h = vaddq_s32(tmp4_h, z1_h);
|
||||
tmp4_l = vaddq_s32(tmp4_l, z3_l);
|
||||
tmp4_h = vaddq_s32(tmp4_h, z3_h);
|
||||
row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
|
||||
vrshrn_n_s32(tmp4_h, DESCALE_P2));
|
||||
|
||||
tmp5_l = vaddq_s32(tmp5_l, z2_l);
|
||||
tmp5_h = vaddq_s32(tmp5_h, z2_h);
|
||||
tmp5_l = vaddq_s32(tmp5_l, z4_l);
|
||||
tmp5_h = vaddq_s32(tmp5_h, z4_h);
|
||||
row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
|
||||
vrshrn_n_s32(tmp5_h, DESCALE_P2));
|
||||
|
||||
tmp6_l = vaddq_s32(tmp6_l, z2_l);
|
||||
tmp6_h = vaddq_s32(tmp6_h, z2_h);
|
||||
tmp6_l = vaddq_s32(tmp6_l, z3_l);
|
||||
tmp6_h = vaddq_s32(tmp6_h, z3_h);
|
||||
row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
|
||||
vrshrn_n_s32(tmp6_h, DESCALE_P2));
|
||||
|
||||
tmp7_l = vaddq_s32(tmp7_l, z1_l);
|
||||
tmp7_h = vaddq_s32(tmp7_h, z1_h);
|
||||
tmp7_l = vaddq_s32(tmp7_l, z4_l);
|
||||
tmp7_h = vaddq_s32(tmp7_h, z4_h);
|
||||
row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
|
||||
vrshrn_n_s32(tmp7_h, DESCALE_P2));
|
||||
|
||||
vst1q_s16(data + 0 * DCTSIZE, row0);
|
||||
vst1q_s16(data + 1 * DCTSIZE, row1);
|
||||
vst1q_s16(data + 2 * DCTSIZE, row2);
|
||||
vst1q_s16(data + 3 * DCTSIZE, row3);
|
||||
vst1q_s16(data + 4 * DCTSIZE, row4);
|
||||
vst1q_s16(data + 5 * DCTSIZE, row5);
|
||||
vst1q_s16(data + 6 * DCTSIZE, row6);
|
||||
vst1q_s16(data + 7 * DCTSIZE, row7);
|
||||
}
|
||||
472
android/extern/libjpeg-turbo/simd/arm/jidctfst-neon.c
vendored
Normal file
472
android/extern/libjpeg-turbo/simd/arm/jidctfst-neon.c
vendored
Normal file
@@ -0,0 +1,472 @@
|
||||
/*
|
||||
* jidctfst-neon.c - fast integer IDCT (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
|
||||
* inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
|
||||
* uses the same calculations and produces exactly the same output as IJG's
|
||||
* original jpeg_idct_ifast() function, which can be found in jidctfst.c.
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.082392200 = 2688 * 2^-15
|
||||
* 0.414213562 = 13568 * 2^-15
|
||||
* 0.847759065 = 27776 * 2^-15
|
||||
* 0.613125930 = 20096 * 2^-15
|
||||
*
|
||||
* See jidctfst.c for further details of the IDCT algorithm. Where possible,
|
||||
* the variable names and comments here in jsimd_idct_ifast_neon() match up
|
||||
* with those in jpeg_idct_ifast().
|
||||
*/
|
||||
|
||||
#define PASS1_BITS 2
|
||||
|
||||
#define F_0_082 2688
|
||||
#define F_0_414 13568
|
||||
#define F_0_847 27776
|
||||
#define F_0_613 20096
|
||||
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
|
||||
F_0_082, F_0_414, F_0_847, F_0_613
|
||||
};
|
||||
|
||||
void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
{
|
||||
IFAST_MULT_TYPE *quantptr = dct_table;
|
||||
|
||||
/* Load DCT coefficients. */
|
||||
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
|
||||
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
|
||||
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
|
||||
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
|
||||
int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
|
||||
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
|
||||
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
|
||||
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
|
||||
|
||||
/* Load quantization table values for DC coefficients. */
|
||||
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
|
||||
/* Dequantize DC coefficients. */
|
||||
row0 = vmulq_s16(row0, quant_row0);
|
||||
|
||||
/* Construct bitmap to test if all AC coefficients are 0. */
|
||||
int16x8_t bitmap = vorrq_s16(row1, row2);
|
||||
bitmap = vorrq_s16(bitmap, row3);
|
||||
bitmap = vorrq_s16(bitmap, row4);
|
||||
bitmap = vorrq_s16(bitmap, row5);
|
||||
bitmap = vorrq_s16(bitmap, row6);
|
||||
bitmap = vorrq_s16(bitmap, row7);
|
||||
|
||||
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
|
||||
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
|
||||
|
||||
/* Load IDCT conversion constants. */
|
||||
const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
|
||||
|
||||
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
|
||||
/* All AC coefficients are zero.
|
||||
* Compute DC values and duplicate into vectors.
|
||||
*/
|
||||
int16x8_t dcval = row0;
|
||||
row1 = dcval;
|
||||
row2 = dcval;
|
||||
row3 = dcval;
|
||||
row4 = dcval;
|
||||
row5 = dcval;
|
||||
row6 = dcval;
|
||||
row7 = dcval;
|
||||
} else if (left_ac_bitmap == 0) {
|
||||
/* AC coefficients are zero for columns 0, 1, 2, and 3.
|
||||
* Use DC values for these columns.
|
||||
*/
|
||||
int16x4_t dcval = vget_low_s16(row0);
|
||||
|
||||
/* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
|
||||
|
||||
/* Load quantization table. */
|
||||
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
|
||||
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
|
||||
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
|
||||
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
|
||||
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
|
||||
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
|
||||
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
|
||||
|
||||
/* Even part: dequantize DCT coefficients. */
|
||||
int16x4_t tmp0 = vget_high_s16(row0);
|
||||
int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
|
||||
int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
|
||||
int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
|
||||
|
||||
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
|
||||
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
|
||||
|
||||
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
|
||||
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
|
||||
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
|
||||
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
|
||||
tmp12 = vsub_s16(tmp12, tmp13);
|
||||
|
||||
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
|
||||
tmp3 = vsub_s16(tmp10, tmp13);
|
||||
tmp1 = vadd_s16(tmp11, tmp12);
|
||||
tmp2 = vsub_s16(tmp11, tmp12);
|
||||
|
||||
/* Odd part: dequantize DCT coefficients. */
|
||||
int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
|
||||
int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
|
||||
int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
|
||||
int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
|
||||
|
||||
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
|
||||
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
|
||||
int16x4_t z11 = vadd_s16(tmp4, tmp7);
|
||||
int16x4_t z12 = vsub_s16(tmp4, tmp7);
|
||||
|
||||
tmp7 = vadd_s16(z11, z13); /* phase 5 */
|
||||
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
|
||||
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
|
||||
tmp11 = vadd_s16(tmp11, z11_sub_z13);
|
||||
|
||||
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
|
||||
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
|
||||
z5 = vadd_s16(z5, z10_add_z12);
|
||||
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
|
||||
tmp10 = vadd_s16(tmp10, z12);
|
||||
tmp10 = vsub_s16(tmp10, z5);
|
||||
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
|
||||
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
|
||||
tmp12 = vadd_s16(tmp12, z5);
|
||||
|
||||
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
|
||||
tmp5 = vsub_s16(tmp11, tmp6);
|
||||
tmp4 = vadd_s16(tmp10, tmp5);
|
||||
|
||||
row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
|
||||
row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
|
||||
row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
|
||||
row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
|
||||
row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
|
||||
row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
|
||||
row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
|
||||
row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
|
||||
} else if (right_ac_bitmap == 0) {
|
||||
/* AC coefficients are zero for columns 4, 5, 6, and 7.
|
||||
* Use DC values for these columns.
|
||||
*/
|
||||
int16x4_t dcval = vget_high_s16(row0);
|
||||
|
||||
/* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
|
||||
|
||||
/* Load quantization table. */
|
||||
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
|
||||
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
|
||||
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
|
||||
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
|
||||
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
|
||||
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
|
||||
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
|
||||
|
||||
/* Even part: dequantize DCT coefficients. */
|
||||
int16x4_t tmp0 = vget_low_s16(row0);
|
||||
int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
|
||||
int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
|
||||
int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
|
||||
|
||||
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
|
||||
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
|
||||
|
||||
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
|
||||
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
|
||||
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
|
||||
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
|
||||
tmp12 = vsub_s16(tmp12, tmp13);
|
||||
|
||||
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
|
||||
tmp3 = vsub_s16(tmp10, tmp13);
|
||||
tmp1 = vadd_s16(tmp11, tmp12);
|
||||
tmp2 = vsub_s16(tmp11, tmp12);
|
||||
|
||||
/* Odd part: dequantize DCT coefficients. */
|
||||
int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
|
||||
int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
|
||||
int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
|
||||
int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
|
||||
|
||||
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
|
||||
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
|
||||
int16x4_t z11 = vadd_s16(tmp4, tmp7);
|
||||
int16x4_t z12 = vsub_s16(tmp4, tmp7);
|
||||
|
||||
tmp7 = vadd_s16(z11, z13); /* phase 5 */
|
||||
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
|
||||
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
|
||||
tmp11 = vadd_s16(tmp11, z11_sub_z13);
|
||||
|
||||
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
|
||||
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
|
||||
z5 = vadd_s16(z5, z10_add_z12);
|
||||
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
|
||||
tmp10 = vadd_s16(tmp10, z12);
|
||||
tmp10 = vsub_s16(tmp10, z5);
|
||||
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
|
||||
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
|
||||
tmp12 = vadd_s16(tmp12, z5);
|
||||
|
||||
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
|
||||
tmp5 = vsub_s16(tmp11, tmp6);
|
||||
tmp4 = vadd_s16(tmp10, tmp5);
|
||||
|
||||
row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
|
||||
row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
|
||||
row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
|
||||
row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
|
||||
row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
|
||||
row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
|
||||
row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
|
||||
row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
|
||||
} else {
|
||||
/* Some AC coefficients are non-zero; full IDCT calculation required. */
|
||||
|
||||
/* Load quantization table. */
|
||||
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
|
||||
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
|
||||
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
|
||||
int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
|
||||
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
|
||||
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
|
||||
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
|
||||
|
||||
/* Even part: dequantize DCT coefficients. */
|
||||
int16x8_t tmp0 = row0;
|
||||
int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
|
||||
int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
|
||||
int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
|
||||
|
||||
int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
|
||||
int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
|
||||
|
||||
int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
|
||||
int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
|
||||
int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
|
||||
tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
|
||||
tmp12 = vsubq_s16(tmp12, tmp13);
|
||||
|
||||
tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
|
||||
tmp3 = vsubq_s16(tmp10, tmp13);
|
||||
tmp1 = vaddq_s16(tmp11, tmp12);
|
||||
tmp2 = vsubq_s16(tmp11, tmp12);
|
||||
|
||||
/* Odd part: dequantize DCT coefficients. */
|
||||
int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
|
||||
int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
|
||||
int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
|
||||
int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
|
||||
|
||||
int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
|
||||
int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
|
||||
int16x8_t z11 = vaddq_s16(tmp4, tmp7);
|
||||
int16x8_t z12 = vsubq_s16(tmp4, tmp7);
|
||||
|
||||
tmp7 = vaddq_s16(z11, z13); /* phase 5 */
|
||||
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
|
||||
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
|
||||
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
|
||||
|
||||
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
|
||||
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
|
||||
z5 = vaddq_s16(z5, z10_add_z12);
|
||||
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
|
||||
tmp10 = vaddq_s16(tmp10, z12);
|
||||
tmp10 = vsubq_s16(tmp10, z5);
|
||||
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
|
||||
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
|
||||
tmp12 = vaddq_s16(tmp12, z5);
|
||||
|
||||
tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
|
||||
tmp5 = vsubq_s16(tmp11, tmp6);
|
||||
tmp4 = vaddq_s16(tmp10, tmp5);
|
||||
|
||||
row0 = vaddq_s16(tmp0, tmp7);
|
||||
row7 = vsubq_s16(tmp0, tmp7);
|
||||
row1 = vaddq_s16(tmp1, tmp6);
|
||||
row6 = vsubq_s16(tmp1, tmp6);
|
||||
row2 = vaddq_s16(tmp2, tmp5);
|
||||
row5 = vsubq_s16(tmp2, tmp5);
|
||||
row4 = vaddq_s16(tmp3, tmp4);
|
||||
row3 = vsubq_s16(tmp3, tmp4);
|
||||
}
|
||||
|
||||
/* Transpose rows to work on columns in pass 2. */
|
||||
int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
|
||||
int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
|
||||
int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
|
||||
int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
|
||||
|
||||
int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
|
||||
vreinterpretq_s32_s16(rows_45.val[0]));
|
||||
int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
|
||||
vreinterpretq_s32_s16(rows_45.val[1]));
|
||||
int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
|
||||
vreinterpretq_s32_s16(rows_67.val[0]));
|
||||
int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
|
||||
vreinterpretq_s32_s16(rows_67.val[1]));
|
||||
|
||||
int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
|
||||
int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
|
||||
int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
|
||||
int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
|
||||
|
||||
int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
|
||||
int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
|
||||
int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
|
||||
int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
|
||||
int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
|
||||
int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
|
||||
int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
|
||||
int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
|
||||
|
||||
/* 1-D IDCT, pass 2 */
|
||||
|
||||
/* Even part */
|
||||
int16x8_t tmp10 = vaddq_s16(col0, col4);
|
||||
int16x8_t tmp11 = vsubq_s16(col0, col4);
|
||||
|
||||
int16x8_t tmp13 = vaddq_s16(col2, col6);
|
||||
int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
|
||||
int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
|
||||
tmp12 = vaddq_s16(tmp12, col2_sub_col6);
|
||||
tmp12 = vsubq_s16(tmp12, tmp13);
|
||||
|
||||
int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
|
||||
int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
|
||||
int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
|
||||
int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
|
||||
|
||||
/* Odd part */
|
||||
int16x8_t z13 = vaddq_s16(col5, col3);
|
||||
int16x8_t neg_z10 = vsubq_s16(col3, col5);
|
||||
int16x8_t z11 = vaddq_s16(col1, col7);
|
||||
int16x8_t z12 = vsubq_s16(col1, col7);
|
||||
|
||||
int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
|
||||
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
|
||||
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
|
||||
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
|
||||
|
||||
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
|
||||
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
|
||||
z5 = vaddq_s16(z5, z10_add_z12);
|
||||
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
|
||||
tmp10 = vaddq_s16(tmp10, z12);
|
||||
tmp10 = vsubq_s16(tmp10, z5);
|
||||
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
|
||||
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
|
||||
tmp12 = vaddq_s16(tmp12, z5);
|
||||
|
||||
int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
|
||||
int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
|
||||
int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
|
||||
|
||||
col0 = vaddq_s16(tmp0, tmp7);
|
||||
col7 = vsubq_s16(tmp0, tmp7);
|
||||
col1 = vaddq_s16(tmp1, tmp6);
|
||||
col6 = vsubq_s16(tmp1, tmp6);
|
||||
col2 = vaddq_s16(tmp2, tmp5);
|
||||
col5 = vsubq_s16(tmp2, tmp5);
|
||||
col4 = vaddq_s16(tmp3, tmp4);
|
||||
col3 = vsubq_s16(tmp3, tmp4);
|
||||
|
||||
/* Scale down by a factor of 8, narrowing to 8-bit. */
|
||||
int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
|
||||
vqshrn_n_s16(col1, PASS1_BITS + 3));
|
||||
int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
|
||||
vqshrn_n_s16(col5, PASS1_BITS + 3));
|
||||
int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
|
||||
vqshrn_n_s16(col3, PASS1_BITS + 3));
|
||||
int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
|
||||
vqshrn_n_s16(col7, PASS1_BITS + 3));
|
||||
/* Clamp to range [0-255]. */
|
||||
uint8x16_t cols_01 =
|
||||
vreinterpretq_u8_s8
|
||||
(vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
|
||||
uint8x16_t cols_45 =
|
||||
vreinterpretq_u8_s8
|
||||
(vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
|
||||
uint8x16_t cols_23 =
|
||||
vreinterpretq_u8_s8
|
||||
(vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
|
||||
uint8x16_t cols_67 =
|
||||
vreinterpretq_u8_s8
|
||||
(vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
|
||||
|
||||
/* Transpose block to prepare for store. */
|
||||
uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
|
||||
vreinterpretq_u32_u8(cols_45));
|
||||
uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
|
||||
vreinterpretq_u32_u8(cols_67));
|
||||
|
||||
uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
|
||||
vreinterpretq_u8_u32(cols_0415.val[1]));
|
||||
uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
|
||||
vreinterpretq_u8_u32(cols_2637.val[1]));
|
||||
uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
|
||||
vreinterpretq_u16_u8(cols_2367.val[0]));
|
||||
uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
|
||||
vreinterpretq_u16_u8(cols_2367.val[1]));
|
||||
|
||||
uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
|
||||
uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
|
||||
uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
|
||||
uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
|
||||
|
||||
JSAMPROW outptr0 = output_buf[0] + output_col;
|
||||
JSAMPROW outptr1 = output_buf[1] + output_col;
|
||||
JSAMPROW outptr2 = output_buf[2] + output_col;
|
||||
JSAMPROW outptr3 = output_buf[3] + output_col;
|
||||
JSAMPROW outptr4 = output_buf[4] + output_col;
|
||||
JSAMPROW outptr5 = output_buf[5] + output_col;
|
||||
JSAMPROW outptr6 = output_buf[6] + output_col;
|
||||
JSAMPROW outptr7 = output_buf[7] + output_col;
|
||||
|
||||
/* Store DCT block to memory. */
|
||||
vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
|
||||
vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
|
||||
vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
|
||||
vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
|
||||
vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
|
||||
vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
|
||||
vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
|
||||
vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
|
||||
}
|
||||
801
android/extern/libjpeg-turbo/simd/arm/jidctint-neon.c
vendored
Normal file
801
android/extern/libjpeg-turbo/simd/arm/jidctint-neon.c
vendored
Normal file
@@ -0,0 +1,801 @@
|
||||
/*
|
||||
* jidctint-neon.c - accurate integer IDCT (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
#include "neon-compat.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
#define CONST_BITS 13
|
||||
#define PASS1_BITS 2
|
||||
|
||||
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
|
||||
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
|
||||
|
||||
/* The computation of the inverse DCT requires the use of constants known at
|
||||
* compile time. Scaled integer constants are used to avoid floating-point
|
||||
* arithmetic:
|
||||
* 0.298631336 = 2446 * 2^-13
|
||||
* 0.390180644 = 3196 * 2^-13
|
||||
* 0.541196100 = 4433 * 2^-13
|
||||
* 0.765366865 = 6270 * 2^-13
|
||||
* 0.899976223 = 7373 * 2^-13
|
||||
* 1.175875602 = 9633 * 2^-13
|
||||
* 1.501321110 = 12299 * 2^-13
|
||||
* 1.847759065 = 15137 * 2^-13
|
||||
* 1.961570560 = 16069 * 2^-13
|
||||
* 2.053119869 = 16819 * 2^-13
|
||||
* 2.562915447 = 20995 * 2^-13
|
||||
* 3.072711026 = 25172 * 2^-13
|
||||
*/
|
||||
|
||||
#define F_0_298 2446
|
||||
#define F_0_390 3196
|
||||
#define F_0_541 4433
|
||||
#define F_0_765 6270
|
||||
#define F_0_899 7373
|
||||
#define F_1_175 9633
|
||||
#define F_1_501 12299
|
||||
#define F_1_847 15137
|
||||
#define F_1_961 16069
|
||||
#define F_2_053 16819
|
||||
#define F_2_562 20995
|
||||
#define F_3_072 25172
|
||||
|
||||
#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
|
||||
#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
|
||||
#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
|
||||
#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
|
||||
#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
|
||||
#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
|
||||
#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
|
||||
#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
|
||||
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
|
||||
F_0_899, F_0_541,
|
||||
F_2_562, F_0_298_MINUS_0_899,
|
||||
F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
|
||||
F_0_541_PLUS_0_765, F_1_175,
|
||||
F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
|
||||
F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
|
||||
0, 0, 0, 0
|
||||
};
|
||||
|
||||
|
||||
/* Forward declaration of regular and sparse IDCT helper functions */
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
|
||||
int16x4_t row1,
|
||||
int16x4_t row2,
|
||||
int16x4_t row3,
|
||||
int16x4_t row4,
|
||||
int16x4_t row5,
|
||||
int16x4_t row6,
|
||||
int16x4_t row7,
|
||||
int16x4_t quant_row0,
|
||||
int16x4_t quant_row1,
|
||||
int16x4_t quant_row2,
|
||||
int16x4_t quant_row3,
|
||||
int16x4_t quant_row4,
|
||||
int16x4_t quant_row5,
|
||||
int16x4_t quant_row6,
|
||||
int16x4_t quant_row7,
|
||||
int16_t *workspace_1,
|
||||
int16_t *workspace_2);
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
|
||||
int16x4_t row1,
|
||||
int16x4_t row2,
|
||||
int16x4_t row3,
|
||||
int16x4_t quant_row0,
|
||||
int16x4_t quant_row1,
|
||||
int16x4_t quant_row2,
|
||||
int16x4_t quant_row3,
|
||||
int16_t *workspace_1,
|
||||
int16_t *workspace_2);
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col,
|
||||
unsigned buf_offset);
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col,
|
||||
unsigned buf_offset);
|
||||
|
||||
|
||||
/* Perform dequantization and inverse DCT on one block of coefficients. For
|
||||
* reference, the C implementation (jpeg_idct_slow()) can be found in
|
||||
* jidctint.c.
|
||||
*
|
||||
* Optimization techniques used for fast data access:
|
||||
*
|
||||
* In each pass, the inverse DCT is computed for the left and right 4x8 halves
|
||||
* of the DCT block. This avoids spilling due to register pressure, and the
|
||||
* increased granularity allows for an optimized calculation depending on the
|
||||
* values of the DCT coefficients. Between passes, intermediate data is stored
|
||||
* in 4x8 workspace buffers.
|
||||
*
|
||||
* Transposing the 8x8 DCT block after each pass can be achieved by transposing
|
||||
* each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
|
||||
* diagram below.) Swapping quadrants is cheap, since the second pass can just
|
||||
* swap the workspace buffer pointers.
|
||||
*
|
||||
* +-------+-------+ +-------+-------+
|
||||
* | | | | | |
|
||||
* | 0 | 1 | | 0 | 2 |
|
||||
* | | | transpose | | |
|
||||
* +-------+-------+ ------> +-------+-------+
|
||||
* | | | | | |
|
||||
* | 2 | 3 | | 1 | 3 |
|
||||
* | | | | | |
|
||||
* +-------+-------+ +-------+-------+
|
||||
*
|
||||
* Optimization techniques used to accelerate the inverse DCT calculation:
|
||||
*
|
||||
* In a DCT coefficient block, the coefficients are increasingly likely to be 0
|
||||
* as you move diagonally from top left to bottom right. If whole rows of
|
||||
* coefficients are 0, then the inverse DCT calculation can be simplified. On
|
||||
* the first pass of the inverse DCT, we test for three special cases before
|
||||
* defaulting to a full "regular" inverse DCT:
|
||||
*
|
||||
* 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
|
||||
* "sparse" simplified inverse DCT on rows 0-3.
|
||||
* 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
|
||||
* result is equal to the dequantized DC coefficients.
|
||||
* 3) AC and DC coefficients are all zero. In this case, the inverse DCT
|
||||
* result is all zero. For the left 4x8 half, this is handled identically
|
||||
* to Case 2 above. For the right 4x8 half, we do no work and signal that
|
||||
* the "sparse" algorithm is required for the second pass.
|
||||
*
|
||||
* In the second pass, only a single special case is tested: whether the AC and
|
||||
* DC coefficients were all zero in the right 4x8 block during the first pass
|
||||
* (refer to Case 3 above.) If this is the case, then a "sparse" variant of
|
||||
* the second pass is performed for both the left and right halves of the DCT
|
||||
* block. (The transposition after the first pass means that the right 4x8
|
||||
* block during the first pass becomes rows 4-7 during the second pass.)
|
||||
*/
|
||||
|
||||
void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
{
|
||||
ISLOW_MULT_TYPE *quantptr = dct_table;
|
||||
|
||||
int16_t workspace_l[8 * DCTSIZE / 2];
|
||||
int16_t workspace_r[8 * DCTSIZE / 2];
|
||||
|
||||
/* Compute IDCT first pass on left 4x8 coefficient block. */
|
||||
|
||||
/* Load DCT coefficients in left 4x8 block. */
|
||||
int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
|
||||
int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
|
||||
int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
|
||||
int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
|
||||
int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
|
||||
int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
|
||||
int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
|
||||
int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
|
||||
|
||||
/* Load quantization table for left 4x8 block. */
|
||||
int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
|
||||
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
|
||||
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
|
||||
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
|
||||
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
|
||||
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
|
||||
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
|
||||
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
|
||||
|
||||
/* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
|
||||
int16x4_t bitmap = vorr_s16(row7, row6);
|
||||
bitmap = vorr_s16(bitmap, row5);
|
||||
bitmap = vorr_s16(bitmap, row4);
|
||||
int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
|
||||
|
||||
if (bitmap_rows_4567 == 0) {
|
||||
bitmap = vorr_s16(bitmap, row3);
|
||||
bitmap = vorr_s16(bitmap, row2);
|
||||
bitmap = vorr_s16(bitmap, row1);
|
||||
int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
|
||||
|
||||
if (left_ac_bitmap == 0) {
|
||||
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
|
||||
int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
|
||||
/* Store 4x4 blocks to workspace, transposing in the process. */
|
||||
vst4_s16(workspace_l, quadrant);
|
||||
vst4_s16(workspace_r, quadrant);
|
||||
} else {
|
||||
jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
|
||||
quant_row1, quant_row2, quant_row3,
|
||||
workspace_l, workspace_r);
|
||||
}
|
||||
} else {
|
||||
jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
|
||||
row6, row7, quant_row0, quant_row1,
|
||||
quant_row2, quant_row3, quant_row4,
|
||||
quant_row5, quant_row6, quant_row7,
|
||||
workspace_l, workspace_r);
|
||||
}
|
||||
|
||||
/* Compute IDCT first pass on right 4x8 coefficient block. */
|
||||
|
||||
/* Load DCT coefficients in right 4x8 block. */
|
||||
row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
|
||||
row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
|
||||
row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
|
||||
row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
|
||||
row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
|
||||
row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
|
||||
row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
|
||||
row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
|
||||
|
||||
/* Load quantization table for right 4x8 block. */
|
||||
quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
|
||||
quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
|
||||
quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
|
||||
quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
|
||||
quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
|
||||
quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
|
||||
quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
|
||||
quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
|
||||
|
||||
/* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
|
||||
bitmap = vorr_s16(row7, row6);
|
||||
bitmap = vorr_s16(bitmap, row5);
|
||||
bitmap = vorr_s16(bitmap, row4);
|
||||
bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
|
||||
bitmap = vorr_s16(bitmap, row3);
|
||||
bitmap = vorr_s16(bitmap, row2);
|
||||
bitmap = vorr_s16(bitmap, row1);
|
||||
int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
|
||||
|
||||
/* If this remains non-zero, a "regular" second pass will be performed. */
|
||||
int64_t right_ac_dc_bitmap = 1;
|
||||
|
||||
if (right_ac_bitmap == 0) {
|
||||
bitmap = vorr_s16(bitmap, row0);
|
||||
right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
|
||||
|
||||
if (right_ac_dc_bitmap != 0) {
|
||||
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
|
||||
int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
|
||||
/* Store 4x4 blocks to workspace, transposing in the process. */
|
||||
vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
|
||||
vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
|
||||
}
|
||||
} else {
|
||||
if (bitmap_rows_4567 == 0) {
|
||||
jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
|
||||
quant_row1, quant_row2, quant_row3,
|
||||
workspace_l + 4 * DCTSIZE / 2,
|
||||
workspace_r + 4 * DCTSIZE / 2);
|
||||
} else {
|
||||
jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
|
||||
row6, row7, quant_row0, quant_row1,
|
||||
quant_row2, quant_row3, quant_row4,
|
||||
quant_row5, quant_row6, quant_row7,
|
||||
workspace_l + 4 * DCTSIZE / 2,
|
||||
workspace_r + 4 * DCTSIZE / 2);
|
||||
}
|
||||
}
|
||||
|
||||
/* Second pass: compute IDCT on rows in workspace. */
|
||||
|
||||
/* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
|
||||
if (right_ac_dc_bitmap == 0) {
|
||||
jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
|
||||
jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
|
||||
} else {
|
||||
jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
|
||||
jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Perform dequantization and the first pass of the accurate inverse DCT on a
|
||||
* 4x8 block of coefficients. (To process the full 8x8 DCT block, this
|
||||
* function-- or some other optimized variant-- needs to be called for both the
|
||||
* left and right 4x8 blocks.)
|
||||
*
|
||||
* This "regular" version assumes that no optimization can be made to the IDCT
|
||||
* calculation, since no useful set of AC coefficients is all 0.
|
||||
*
|
||||
* The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
|
||||
* found in jidctint.c. Algorithmic changes made here are documented inline.
|
||||
*/
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
|
||||
int16x4_t row1,
|
||||
int16x4_t row2,
|
||||
int16x4_t row3,
|
||||
int16x4_t row4,
|
||||
int16x4_t row5,
|
||||
int16x4_t row6,
|
||||
int16x4_t row7,
|
||||
int16x4_t quant_row0,
|
||||
int16x4_t quant_row1,
|
||||
int16x4_t quant_row2,
|
||||
int16x4_t quant_row3,
|
||||
int16x4_t quant_row4,
|
||||
int16x4_t quant_row5,
|
||||
int16x4_t quant_row6,
|
||||
int16x4_t quant_row7,
|
||||
int16_t *workspace_1,
|
||||
int16_t *workspace_2)
|
||||
{
|
||||
/* Load constants for IDCT computation. */
|
||||
#ifdef HAVE_VLD1_S16_X3
|
||||
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
|
||||
#else
|
||||
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
|
||||
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
|
||||
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
|
||||
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
|
||||
#endif
|
||||
|
||||
/* Even part */
|
||||
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
|
||||
int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
|
||||
|
||||
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
|
||||
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
|
||||
tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
|
||||
|
||||
z2_s16 = vmul_s16(row0, quant_row0);
|
||||
z3_s16 = vmul_s16(row4, quant_row4);
|
||||
|
||||
int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
|
||||
int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
|
||||
|
||||
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
|
||||
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
|
||||
|
||||
/* Odd part */
|
||||
int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
|
||||
int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
|
||||
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
|
||||
int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
|
||||
|
||||
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
|
||||
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
|
||||
|
||||
/* Implementation as per jpeg_idct_islow() in jidctint.c:
|
||||
* z5 = (z3 + z4) * 1.175875602;
|
||||
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
* z3 += z5; z4 += z5;
|
||||
*
|
||||
* This implementation:
|
||||
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
*/
|
||||
|
||||
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
|
||||
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
|
||||
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
|
||||
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
|
||||
|
||||
/* Implementation as per jpeg_idct_islow() in jidctint.c:
|
||||
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
* tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
* tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
*
|
||||
* This implementation:
|
||||
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
* tmp0 += z3; tmp1 += z4;
|
||||
* tmp2 += z3; tmp3 += z4;
|
||||
*/
|
||||
|
||||
tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
|
||||
tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
|
||||
tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
|
||||
tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
|
||||
|
||||
tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
|
||||
tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
|
||||
tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
|
||||
tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
|
||||
|
||||
tmp0 = vaddq_s32(tmp0, z3);
|
||||
tmp1 = vaddq_s32(tmp1, z4);
|
||||
tmp2 = vaddq_s32(tmp2, z3);
|
||||
tmp3 = vaddq_s32(tmp3, z4);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
int16x4x4_t rows_0123 = { {
|
||||
vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
|
||||
} };
|
||||
int16x4x4_t rows_4567 = { {
|
||||
vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
|
||||
} };
|
||||
|
||||
/* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
|
||||
* (VST4 transposes the blocks. We need to operate on rows in the next
|
||||
* pass.)
|
||||
*/
|
||||
vst4_s16(workspace_1, rows_0123);
|
||||
vst4_s16(workspace_2, rows_4567);
|
||||
}
|
||||
|
||||
|
||||
/* Perform dequantization and the first pass of the accurate inverse DCT on a
|
||||
* 4x8 block of coefficients.
|
||||
*
|
||||
* This "sparse" version assumes that the AC coefficients in rows 4-7 are all
|
||||
* 0. This simplifies the IDCT calculation, accelerating overall performance.
|
||||
*/
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
|
||||
int16x4_t row1,
|
||||
int16x4_t row2,
|
||||
int16x4_t row3,
|
||||
int16x4_t quant_row0,
|
||||
int16x4_t quant_row1,
|
||||
int16x4_t quant_row2,
|
||||
int16x4_t quant_row3,
|
||||
int16_t *workspace_1,
|
||||
int16_t *workspace_2)
|
||||
{
|
||||
/* Load constants for IDCT computation. */
|
||||
#ifdef HAVE_VLD1_S16_X3
|
||||
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
|
||||
#else
|
||||
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
|
||||
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
|
||||
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
|
||||
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
|
||||
#endif
|
||||
|
||||
/* Even part (z3 is all 0) */
|
||||
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
|
||||
|
||||
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
|
||||
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
|
||||
|
||||
z2_s16 = vmul_s16(row0, quant_row0);
|
||||
int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
|
||||
int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
|
||||
|
||||
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
|
||||
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
|
||||
|
||||
/* Odd part (tmp0 and tmp1 are both all 0) */
|
||||
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
|
||||
int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
|
||||
|
||||
int16x4_t z3_s16 = tmp2_s16;
|
||||
int16x4_t z4_s16 = tmp3_s16;
|
||||
|
||||
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
|
||||
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
|
||||
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
|
||||
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
|
||||
|
||||
tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
|
||||
tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
|
||||
tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
|
||||
tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
int16x4x4_t rows_0123 = { {
|
||||
vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
|
||||
} };
|
||||
int16x4x4_t rows_4567 = { {
|
||||
vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
|
||||
} };
|
||||
|
||||
/* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
|
||||
* (VST4 transposes the blocks. We need to operate on rows in the next
|
||||
* pass.)
|
||||
*/
|
||||
vst4_s16(workspace_1, rows_0123);
|
||||
vst4_s16(workspace_2, rows_4567);
|
||||
}
|
||||
|
||||
|
||||
/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
|
||||
* coefficients. (To process the full 8x8 DCT block, this function-- or some
|
||||
* other optimized variant-- needs to be called for both the right and left 4x8
|
||||
* blocks.)
|
||||
*
|
||||
* This "regular" version assumes that no optimization can be made to the IDCT
|
||||
* calculation, since no useful set of coefficient values are all 0 after the
|
||||
* first pass.
|
||||
*
|
||||
* Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
|
||||
* can be found in jidctint.c. Algorithmic changes made here are documented
|
||||
* inline.
|
||||
*/
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col,
|
||||
unsigned buf_offset)
|
||||
{
|
||||
/* Load constants for IDCT computation. */
|
||||
#ifdef HAVE_VLD1_S16_X3
|
||||
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
|
||||
#else
|
||||
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
|
||||
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
|
||||
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
|
||||
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
|
||||
#endif
|
||||
|
||||
/* Even part */
|
||||
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
|
||||
int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
|
||||
|
||||
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
|
||||
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
|
||||
tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
|
||||
|
||||
z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
|
||||
z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
|
||||
|
||||
int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
|
||||
int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
|
||||
|
||||
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
|
||||
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
|
||||
|
||||
/* Odd part */
|
||||
int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
|
||||
int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
|
||||
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
|
||||
int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
|
||||
|
||||
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
|
||||
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
|
||||
|
||||
/* Implementation as per jpeg_idct_islow() in jidctint.c:
|
||||
* z5 = (z3 + z4) * 1.175875602;
|
||||
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
* z3 += z5; z4 += z5;
|
||||
*
|
||||
* This implementation:
|
||||
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
*/
|
||||
|
||||
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
|
||||
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
|
||||
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
|
||||
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
|
||||
|
||||
/* Implementation as per jpeg_idct_islow() in jidctint.c:
|
||||
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
* tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
* tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
*
|
||||
* This implementation:
|
||||
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
* tmp0 += z3; tmp1 += z4;
|
||||
* tmp2 += z3; tmp3 += z4;
|
||||
*/
|
||||
|
||||
tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
|
||||
tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
|
||||
tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
|
||||
tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
|
||||
|
||||
tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
|
||||
tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
|
||||
tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
|
||||
tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
|
||||
|
||||
tmp0 = vaddq_s32(tmp0, z3);
|
||||
tmp1 = vaddq_s32(tmp1, z4);
|
||||
tmp2 = vaddq_s32(tmp2, z3);
|
||||
tmp3 = vaddq_s32(tmp3, z4);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
|
||||
vaddhn_s32(tmp12, tmp1));
|
||||
int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
|
||||
vaddhn_s32(tmp13, tmp0));
|
||||
int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
|
||||
vsubhn_s32(tmp11, tmp2));
|
||||
int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
|
||||
vsubhn_s32(tmp10, tmp3));
|
||||
/* Descale and narrow to 8-bit. */
|
||||
int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
|
||||
int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
|
||||
int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
|
||||
int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
|
||||
/* Clamp to range [0-255]. */
|
||||
uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
|
||||
/* Transpose 4x8 block and store to memory. (Zipping adjacent columns
|
||||
* together allows us to store 16-bit elements.)
|
||||
*/
|
||||
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
|
||||
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
|
||||
uint16x4x4_t cols_01_23_45_67 = { {
|
||||
vreinterpret_u16_u8(cols_01_23.val[0]),
|
||||
vreinterpret_u16_u8(cols_01_23.val[1]),
|
||||
vreinterpret_u16_u8(cols_45_67.val[0]),
|
||||
vreinterpret_u16_u8(cols_45_67.val[1])
|
||||
} };
|
||||
|
||||
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
|
||||
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
|
||||
JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
|
||||
JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
|
||||
/* VST4 of 16-bit elements completes the transpose. */
|
||||
vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
|
||||
vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
|
||||
vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
|
||||
vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
|
||||
}
|
||||
|
||||
|
||||
/* Performs the second pass of the accurate inverse DCT on a 4x8 block
|
||||
* of coefficients.
|
||||
*
|
||||
* This "sparse" version assumes that the coefficient values (after the first
|
||||
* pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
|
||||
* accelerating overall performance.
|
||||
*/
|
||||
|
||||
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col,
|
||||
unsigned buf_offset)
|
||||
{
|
||||
/* Load constants for IDCT computation. */
|
||||
#ifdef HAVE_VLD1_S16_X3
|
||||
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
|
||||
#else
|
||||
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
|
||||
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
|
||||
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
|
||||
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
|
||||
#endif
|
||||
|
||||
/* Even part (z3 is all 0) */
|
||||
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
|
||||
|
||||
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
|
||||
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
|
||||
|
||||
z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
|
||||
int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
|
||||
int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
|
||||
|
||||
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
|
||||
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
|
||||
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
|
||||
|
||||
/* Odd part (tmp0 and tmp1 are both all 0) */
|
||||
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
|
||||
int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
|
||||
|
||||
int16x4_t z3_s16 = tmp2_s16;
|
||||
int16x4_t z4_s16 = tmp3_s16;
|
||||
|
||||
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
|
||||
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
|
||||
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
|
||||
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
|
||||
|
||||
tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
|
||||
tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
|
||||
tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
|
||||
tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
|
||||
vaddhn_s32(tmp12, tmp1));
|
||||
int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
|
||||
vaddhn_s32(tmp13, tmp0));
|
||||
int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
|
||||
vsubhn_s32(tmp11, tmp2));
|
||||
int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
|
||||
vsubhn_s32(tmp10, tmp3));
|
||||
/* Descale and narrow to 8-bit. */
|
||||
int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
|
||||
int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
|
||||
int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
|
||||
int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
|
||||
/* Clamp to range [0-255]. */
|
||||
uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
|
||||
vdup_n_u8(CENTERJSAMPLE));
|
||||
|
||||
/* Transpose 4x8 block and store to memory. (Zipping adjacent columns
|
||||
* together allows us to store 16-bit elements.)
|
||||
*/
|
||||
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
|
||||
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
|
||||
uint16x4x4_t cols_01_23_45_67 = { {
|
||||
vreinterpret_u16_u8(cols_01_23.val[0]),
|
||||
vreinterpret_u16_u8(cols_01_23.val[1]),
|
||||
vreinterpret_u16_u8(cols_45_67.val[0]),
|
||||
vreinterpret_u16_u8(cols_45_67.val[1])
|
||||
} };
|
||||
|
||||
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
|
||||
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
|
||||
JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
|
||||
JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
|
||||
/* VST4 of 16-bit elements completes the transpose. */
|
||||
vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
|
||||
vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
|
||||
vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
|
||||
vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
|
||||
}
|
||||
486
android/extern/libjpeg-turbo/simd/arm/jidctred-neon.c
vendored
Normal file
486
android/extern/libjpeg-turbo/simd/arm/jidctred-neon.c
vendored
Normal file
@@ -0,0 +1,486 @@
|
||||
/*
|
||||
* jidctred-neon.c - reduced-size IDCT (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
#include "neon-compat.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
#define CONST_BITS 13
|
||||
#define PASS1_BITS 2
|
||||
|
||||
#define F_0_211 1730
|
||||
#define F_0_509 4176
|
||||
#define F_0_601 4926
|
||||
#define F_0_720 5906
|
||||
#define F_0_765 6270
|
||||
#define F_0_850 6967
|
||||
#define F_0_899 7373
|
||||
#define F_1_061 8697
|
||||
#define F_1_272 10426
|
||||
#define F_1_451 11893
|
||||
#define F_1_847 15137
|
||||
#define F_2_172 17799
|
||||
#define F_2_562 20995
|
||||
#define F_3_624 29692
|
||||
|
||||
|
||||
/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
|
||||
* 2x2 output from an 8x8 DCT block. It uses the same calculations and
|
||||
* produces exactly the same output as IJG's original jpeg_idct_2x2() function
|
||||
* from jpeg-6b, which can be found in jidctred.c.
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.720959822 = 5906 * 2^-13
|
||||
* 0.850430095 = 6967 * 2^-13
|
||||
* 1.272758580 = 10426 * 2^-13
|
||||
* 3.624509785 = 29692 * 2^-13
|
||||
*
|
||||
* See jidctred.c for further details of the 2x2 IDCT algorithm. Where
|
||||
* possible, the variable names and comments here in jsimd_idct_2x2_neon()
|
||||
* match up with those in jpeg_idct_2x2().
|
||||
*/
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
|
||||
-F_0_720, F_0_850, -F_1_272, F_3_624
|
||||
};
|
||||
|
||||
void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
{
|
||||
ISLOW_MULT_TYPE *quantptr = dct_table;
|
||||
|
||||
/* Load DCT coefficients. */
|
||||
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
|
||||
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
|
||||
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
|
||||
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
|
||||
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
|
||||
|
||||
/* Load quantization table values. */
|
||||
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
|
||||
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
|
||||
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
|
||||
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
|
||||
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
|
||||
|
||||
/* Dequantize DCT coefficients. */
|
||||
row0 = vmulq_s16(row0, quant_row0);
|
||||
row1 = vmulq_s16(row1, quant_row1);
|
||||
row3 = vmulq_s16(row3, quant_row3);
|
||||
row5 = vmulq_s16(row5, quant_row5);
|
||||
row7 = vmulq_s16(row7, quant_row7);
|
||||
|
||||
/* Load IDCT conversion constants. */
|
||||
const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
|
||||
|
||||
/* Pass 1: process columns from input, put results in vectors row0 and
|
||||
* row1.
|
||||
*/
|
||||
|
||||
/* Even part */
|
||||
int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
|
||||
int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
|
||||
|
||||
/* Odd part */
|
||||
int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
|
||||
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
|
||||
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
|
||||
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
|
||||
int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
|
||||
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
|
||||
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
|
||||
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
|
||||
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
|
||||
row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
|
||||
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
|
||||
|
||||
/* Transpose two rows, ready for second pass. */
|
||||
int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
|
||||
int16x8_t cols_0246 = cols_0246_1357.val[0];
|
||||
int16x8_t cols_1357 = cols_0246_1357.val[1];
|
||||
/* Duplicate columns such that each is accessible in its own vector. */
|
||||
int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
|
||||
vreinterpretq_s32_s16(cols_1357));
|
||||
int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
|
||||
int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
|
||||
|
||||
/* Pass 2: process two rows, store to output array. */
|
||||
|
||||
/* Even part: we're only interested in col0; the top half of tmp10 is "don't
|
||||
* care."
|
||||
*/
|
||||
int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
|
||||
|
||||
/* Odd part: we're only interested in the bottom half of tmp0. */
|
||||
int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
|
||||
tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
|
||||
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
|
||||
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
|
||||
|
||||
/* Final output stage: descale and clamp to range [0-255]. */
|
||||
int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
|
||||
vsubhn_s32(tmp10, tmp0));
|
||||
output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
|
||||
CONST_BITS + PASS1_BITS + 3 + 2 - 16);
|
||||
/* Narrow to 8-bit and convert to unsigned. */
|
||||
uint8x8_t output_u8 = vqmovun_s16(output_s16);
|
||||
|
||||
/* Store 2x2 block to memory. */
|
||||
vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
|
||||
vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
|
||||
vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
|
||||
vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
|
||||
}
|
||||
|
||||
|
||||
/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
|
||||
* 4x4 output from an 8x8 DCT block. It uses the same calculations and
|
||||
* produces exactly the same output as IJG's original jpeg_idct_4x4() function
|
||||
* from jpeg-6b, which can be found in jidctred.c.
|
||||
*
|
||||
* Scaled integer constants are used to avoid floating-point arithmetic:
|
||||
* 0.211164243 = 1730 * 2^-13
|
||||
* 0.509795579 = 4176 * 2^-13
|
||||
* 0.601344887 = 4926 * 2^-13
|
||||
* 0.765366865 = 6270 * 2^-13
|
||||
* 0.899976223 = 7373 * 2^-13
|
||||
* 1.061594337 = 8697 * 2^-13
|
||||
* 1.451774981 = 11893 * 2^-13
|
||||
* 1.847759065 = 15137 * 2^-13
|
||||
* 2.172734803 = 17799 * 2^-13
|
||||
* 2.562915447 = 20995 * 2^-13
|
||||
*
|
||||
* See jidctred.c for further details of the 4x4 IDCT algorithm. Where
|
||||
* possible, the variable names and comments here in jsimd_idct_4x4_neon()
|
||||
* match up with those in jpeg_idct_4x4().
|
||||
*/
|
||||
|
||||
ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
|
||||
F_1_847, -F_0_765, -F_0_211, F_1_451,
|
||||
-F_2_172, F_1_061, -F_0_509, -F_0_601,
|
||||
F_0_899, F_2_562, 0, 0
|
||||
};
|
||||
|
||||
void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
{
|
||||
ISLOW_MULT_TYPE *quantptr = dct_table;
|
||||
|
||||
/* Load DCT coefficients. */
|
||||
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
|
||||
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
|
||||
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
|
||||
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
|
||||
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
|
||||
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
|
||||
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
|
||||
|
||||
/* Load quantization table values for DC coefficients. */
|
||||
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
|
||||
/* Dequantize DC coefficients. */
|
||||
row0 = vmulq_s16(row0, quant_row0);
|
||||
|
||||
/* Construct bitmap to test if all AC coefficients are 0. */
|
||||
int16x8_t bitmap = vorrq_s16(row1, row2);
|
||||
bitmap = vorrq_s16(bitmap, row3);
|
||||
bitmap = vorrq_s16(bitmap, row5);
|
||||
bitmap = vorrq_s16(bitmap, row6);
|
||||
bitmap = vorrq_s16(bitmap, row7);
|
||||
|
||||
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
|
||||
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
|
||||
|
||||
/* Load constants for IDCT computation. */
|
||||
#ifdef HAVE_VLD1_S16_X3
|
||||
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
|
||||
#else
|
||||
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
|
||||
const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
|
||||
const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
|
||||
const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
|
||||
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
|
||||
#endif
|
||||
|
||||
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
|
||||
/* All AC coefficients are zero.
|
||||
* Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
|
||||
*/
|
||||
int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
|
||||
row0 = dcval;
|
||||
row1 = dcval;
|
||||
row2 = dcval;
|
||||
row3 = dcval;
|
||||
} else if (left_ac_bitmap == 0) {
|
||||
/* AC coefficients are zero for columns 0, 1, 2, and 3.
|
||||
* Compute DC values for these columns.
|
||||
*/
|
||||
int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
|
||||
|
||||
/* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
|
||||
|
||||
/* Load quantization table. */
|
||||
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
|
||||
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
|
||||
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
|
||||
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
|
||||
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
|
||||
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
|
||||
|
||||
/* Even part */
|
||||
int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
|
||||
|
||||
int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
|
||||
int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
|
||||
|
||||
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
|
||||
|
||||
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
|
||||
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
|
||||
|
||||
/* Odd part */
|
||||
int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
|
||||
z2 = vmul_s16(vget_high_s16(row5), quant_row5);
|
||||
z3 = vmul_s16(vget_high_s16(row3), quant_row3);
|
||||
int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
|
||||
|
||||
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
|
||||
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
|
||||
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
|
||||
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
|
||||
|
||||
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
} else if (right_ac_bitmap == 0) {
|
||||
/* AC coefficients are zero for columns 4, 5, 6, and 7.
|
||||
* Compute DC values for these columns.
|
||||
*/
|
||||
int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
|
||||
|
||||
/* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
|
||||
|
||||
/* Load quantization table. */
|
||||
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
|
||||
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
|
||||
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
|
||||
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
|
||||
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
|
||||
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
|
||||
|
||||
/* Even part */
|
||||
int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
|
||||
|
||||
int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
|
||||
int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
|
||||
|
||||
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
|
||||
|
||||
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
|
||||
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
|
||||
|
||||
/* Odd part */
|
||||
int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
|
||||
z2 = vmul_s16(vget_low_s16(row5), quant_row5);
|
||||
z3 = vmul_s16(vget_low_s16(row3), quant_row3);
|
||||
int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
|
||||
|
||||
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
|
||||
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
|
||||
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
|
||||
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
|
||||
|
||||
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
|
||||
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
|
||||
CONST_BITS - PASS1_BITS + 1), dcval);
|
||||
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
|
||||
CONST_BITS - PASS1_BITS + 1), dcval);
|
||||
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
|
||||
CONST_BITS - PASS1_BITS + 1), dcval);
|
||||
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
|
||||
CONST_BITS - PASS1_BITS + 1), dcval);
|
||||
} else {
|
||||
/* All AC coefficients are non-zero; full IDCT calculation required. */
|
||||
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
|
||||
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
|
||||
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
|
||||
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
|
||||
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
|
||||
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
|
||||
|
||||
/* Even part */
|
||||
int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
|
||||
int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
|
||||
|
||||
int16x8_t z2 = vmulq_s16(row2, quant_row2);
|
||||
int16x8_t z3 = vmulq_s16(row6, quant_row6);
|
||||
|
||||
int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
|
||||
int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
|
||||
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
|
||||
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
|
||||
|
||||
int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
|
||||
int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
|
||||
int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
|
||||
int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
|
||||
|
||||
/* Odd part */
|
||||
int16x8_t z1 = vmulq_s16(row7, quant_row7);
|
||||
z2 = vmulq_s16(row5, quant_row5);
|
||||
z3 = vmulq_s16(row3, quant_row3);
|
||||
int16x8_t z4 = vmulq_s16(row1, quant_row1);
|
||||
|
||||
tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
|
||||
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
|
||||
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
|
||||
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
|
||||
tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
|
||||
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
|
||||
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
|
||||
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
|
||||
|
||||
tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
|
||||
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
|
||||
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
|
||||
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
|
||||
tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
|
||||
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
|
||||
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
|
||||
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
|
||||
|
||||
/* Final output stage: descale and narrow to 16-bit. */
|
||||
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
|
||||
CONST_BITS - PASS1_BITS + 1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
|
||||
CONST_BITS - PASS1_BITS + 1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
|
||||
CONST_BITS - PASS1_BITS + 1),
|
||||
vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
|
||||
CONST_BITS - PASS1_BITS + 1),
|
||||
vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
|
||||
CONST_BITS - PASS1_BITS + 1));
|
||||
}
|
||||
|
||||
/* Transpose 8x4 block to perform IDCT on rows in second pass. */
|
||||
int16x8x2_t row_01 = vtrnq_s16(row0, row1);
|
||||
int16x8x2_t row_23 = vtrnq_s16(row2, row3);
|
||||
|
||||
int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
|
||||
vreinterpretq_s32_s16(row_23.val[0]));
|
||||
int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
|
||||
vreinterpretq_s32_s16(row_23.val[1]));
|
||||
|
||||
int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
|
||||
int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
|
||||
int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
|
||||
int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
|
||||
int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
|
||||
int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
|
||||
int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
|
||||
|
||||
/* Commence second pass of IDCT. */
|
||||
|
||||
/* Even part */
|
||||
int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
|
||||
int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
|
||||
tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
|
||||
|
||||
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
|
||||
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
|
||||
|
||||
/* Odd part */
|
||||
tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
|
||||
tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
|
||||
tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
|
||||
tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
|
||||
|
||||
tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
|
||||
tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
|
||||
tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
|
||||
tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
|
||||
|
||||
/* Final output stage: descale and clamp to range [0-255]. */
|
||||
int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
|
||||
vsubhn_s32(tmp12, tmp0));
|
||||
int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
|
||||
vsubhn_s32(tmp10, tmp2));
|
||||
output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
|
||||
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
|
||||
output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
|
||||
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
|
||||
/* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
|
||||
* An interleaving store completes the transpose.
|
||||
*/
|
||||
uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
|
||||
vqmovun_s16(output_cols_13));
|
||||
uint16x4x2_t output_01_23 = { {
|
||||
vreinterpret_u16_u8(output_0123.val[0]),
|
||||
vreinterpret_u16_u8(output_0123.val[1])
|
||||
} };
|
||||
|
||||
/* Store 4x4 block to memory. */
|
||||
JSAMPROW outptr0 = output_buf[0] + output_col;
|
||||
JSAMPROW outptr1 = output_buf[1] + output_col;
|
||||
JSAMPROW outptr2 = output_buf[2] + output_col;
|
||||
JSAMPROW outptr3 = output_buf[3] + output_col;
|
||||
vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
|
||||
vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
|
||||
vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
|
||||
vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
|
||||
}
|
||||
193
android/extern/libjpeg-turbo/simd/arm/jquanti-neon.c
vendored
Normal file
193
android/extern/libjpeg-turbo/simd/arm/jquanti-neon.c
vendored
Normal file
@@ -0,0 +1,193 @@
|
||||
/*
|
||||
* jquanti-neon.c - sample data conversion and quantization (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
/* After downsampling, the resulting sample values are in the range [0, 255],
|
||||
* but the Discrete Cosine Transform (DCT) operates on values centered around
|
||||
* 0.
|
||||
*
|
||||
* To prepare sample values for the DCT, load samples into a DCT workspace,
|
||||
* subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
|
||||
* are also widened from 8- to 16-bit.
|
||||
*
|
||||
* The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
|
||||
*/
|
||||
|
||||
void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
DCTELEM *workspace)
|
||||
{
|
||||
uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
|
||||
uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
|
||||
uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
|
||||
uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
|
||||
uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
|
||||
uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
|
||||
uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
|
||||
uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
|
||||
|
||||
int16x8_t row0 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
|
||||
int16x8_t row1 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
|
||||
int16x8_t row2 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
|
||||
int16x8_t row3 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
|
||||
int16x8_t row4 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
|
||||
int16x8_t row5 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
|
||||
int16x8_t row6 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
|
||||
int16x8_t row7 =
|
||||
vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
|
||||
|
||||
vst1q_s16(workspace + 0 * DCTSIZE, row0);
|
||||
vst1q_s16(workspace + 1 * DCTSIZE, row1);
|
||||
vst1q_s16(workspace + 2 * DCTSIZE, row2);
|
||||
vst1q_s16(workspace + 3 * DCTSIZE, row3);
|
||||
vst1q_s16(workspace + 4 * DCTSIZE, row4);
|
||||
vst1q_s16(workspace + 5 * DCTSIZE, row5);
|
||||
vst1q_s16(workspace + 6 * DCTSIZE, row6);
|
||||
vst1q_s16(workspace + 7 * DCTSIZE, row7);
|
||||
}
|
||||
|
||||
|
||||
/* After the DCT, the resulting array of coefficient values needs to be divided
|
||||
* by an array of quantization values.
|
||||
*
|
||||
* To avoid a slow division operation, the DCT coefficients are multiplied by
|
||||
* the (scaled) reciprocals of the quantization values and then right-shifted.
|
||||
*
|
||||
* The equivalent scalar C function quantize() can be found in jcdctmgr.c.
|
||||
*/
|
||||
|
||||
void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
|
||||
DCTELEM *workspace)
|
||||
{
|
||||
JCOEFPTR out_ptr = coef_block;
|
||||
UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
|
||||
UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
|
||||
DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
|
||||
int i;
|
||||
|
||||
#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
|
||||
#pragma unroll
|
||||
#endif
|
||||
for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
|
||||
/* Load DCT coefficients. */
|
||||
int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
|
||||
int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
|
||||
int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
|
||||
int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
|
||||
/* Load reciprocals of quantization values. */
|
||||
uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
|
||||
uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
|
||||
uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
|
||||
uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
|
||||
uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
|
||||
uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
|
||||
uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
|
||||
uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
|
||||
int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
|
||||
int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
|
||||
int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
|
||||
int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
|
||||
|
||||
/* Extract sign from coefficients. */
|
||||
int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
|
||||
int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
|
||||
int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
|
||||
int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
|
||||
/* Get absolute value of DCT coefficients. */
|
||||
uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
|
||||
uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
|
||||
uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
|
||||
uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
|
||||
/* Add correction. */
|
||||
abs_row0 = vaddq_u16(abs_row0, corr0);
|
||||
abs_row1 = vaddq_u16(abs_row1, corr1);
|
||||
abs_row2 = vaddq_u16(abs_row2, corr2);
|
||||
abs_row3 = vaddq_u16(abs_row3, corr3);
|
||||
|
||||
/* Multiply DCT coefficients by quantization reciprocals. */
|
||||
int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
|
||||
vget_low_u16(recip0)));
|
||||
int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
|
||||
vget_high_u16(recip0)));
|
||||
int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
|
||||
vget_low_u16(recip1)));
|
||||
int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
|
||||
vget_high_u16(recip1)));
|
||||
int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
|
||||
vget_low_u16(recip2)));
|
||||
int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
|
||||
vget_high_u16(recip2)));
|
||||
int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
|
||||
vget_low_u16(recip3)));
|
||||
int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
|
||||
vget_high_u16(recip3)));
|
||||
/* Narrow back to 16-bit. */
|
||||
row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
|
||||
row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
|
||||
row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
|
||||
row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
|
||||
|
||||
/* Since VSHR only supports an immediate as its second argument, negate the
|
||||
* shift value and shift left.
|
||||
*/
|
||||
row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
|
||||
vnegq_s16(shift0)));
|
||||
row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
|
||||
vnegq_s16(shift1)));
|
||||
row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
|
||||
vnegq_s16(shift2)));
|
||||
row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
|
||||
vnegq_s16(shift3)));
|
||||
|
||||
/* Restore sign to original product. */
|
||||
row0 = veorq_s16(row0, sign_row0);
|
||||
row0 = vsubq_s16(row0, sign_row0);
|
||||
row1 = veorq_s16(row1, sign_row1);
|
||||
row1 = vsubq_s16(row1, sign_row1);
|
||||
row2 = veorq_s16(row2, sign_row2);
|
||||
row2 = vsubq_s16(row2, sign_row2);
|
||||
row3 = veorq_s16(row3, sign_row3);
|
||||
row3 = vsubq_s16(row3, sign_row3);
|
||||
|
||||
/* Store quantized coefficients to memory. */
|
||||
vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
|
||||
vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
|
||||
vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
|
||||
vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
|
||||
}
|
||||
}
|
||||
37
android/extern/libjpeg-turbo/simd/arm/neon-compat.h.in
vendored
Normal file
37
android/extern/libjpeg-turbo/simd/arm/neon-compat.h.in
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
|
||||
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#cmakedefine HAVE_VLD1_S16_X3
|
||||
#cmakedefine HAVE_VLD1_U16_X2
|
||||
#cmakedefine HAVE_VLD1Q_U8_X4
|
||||
|
||||
/* Define compiler-independent count-leading-zeros and byte-swap macros */
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
|
||||
#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
|
||||
#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
|
||||
#elif defined(__clang__) || defined(__GNUC__)
|
||||
#define BUILTIN_CLZ(x) __builtin_clz(x)
|
||||
#define BUILTIN_CLZLL(x) __builtin_clzll(x)
|
||||
#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
|
||||
#else
|
||||
#error "Unknown compiler"
|
||||
#endif
|
||||
Reference in New Issue
Block a user