Update code to v1.0.14 (10)

This commit is contained in:
Caten
2024-02-29 19:35:00 +08:00
parent c2ee3b694c
commit a956d26f6d
3188 changed files with 2317293 additions and 146 deletions

View File

@@ -0,0 +1,148 @@
/*
* jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-neon.c */
/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
* 0.16874695 = 11059 * 2^-16
* 0.33125305 = 21709 * 2^-16
* 0.50000000 = 32768 * 2^-16
* 0.41868592 = 27439 * 2^-16
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
* rounds up or down the result via integer truncation.
*/
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
/* Pointer to RGB(X/A) input data */
JSAMPROW inptr;
/* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
/* Allocate temporary buffer for final (image_width % 8) pixels in row. */
ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
/* Set up conversion constants. */
#ifdef HAVE_VLD1_U16_X2
const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x2(). */
const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
const uint16x4x2_t consts = { { consts1, consts2 } };
#endif
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr0 = output_buf[0][output_row];
outptr1 = output_buf[1][output_row];
outptr2 = output_buf[2][output_row];
output_row++;
int cols_remaining = image_width;
for (; cols_remaining > 0; cols_remaining -= 8) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 8) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
if (cols_remaining < 8) {
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
#if RGB_PIXELSIZE == 4
uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_low = scaled_128_5;
cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
uint32x4_t cb_high = scaled_128_5;
cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_low = scaled_128_5;
cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
uint32x4_t cr_high = scaled_128_5;
cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
vrshrn_n_u32(y_high, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
vshrn_n_u32(cb_high, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
vshrn_n_u32(cr_high, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1_u8(outptr0, vmovn_u16(y_u16));
vst1_u8(outptr1, vmovn_u16(cb_u16));
vst1_u8(outptr2, vmovn_u16(cr_u16));
/* Increment pointers. */
inptr += (8 * RGB_PIXELSIZE);
outptr0 += 8;
outptr1 += 8;
outptr2 += 8;
}
}
}

View File

@@ -0,0 +1,334 @@
/*
* jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* NOTE: All referenced figures are from
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include "../jchuff.h"
#include "neon-compat.h"
#include <limits.h>
#include <arm_neon.h>
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
uint8_t block_nbits[DCTSIZE2];
uint16_t block_diff[DCTSIZE2];
/* Load rows of coefficients from DCT block in zig-zag order. */
/* Compute DC coefficient difference value. (F.1.1.5.1) */
int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
row0 = vld1q_lane_s16(block + 1, row0, 1);
row0 = vld1q_lane_s16(block + 8, row0, 2);
row0 = vld1q_lane_s16(block + 16, row0, 3);
row0 = vld1q_lane_s16(block + 9, row0, 4);
row0 = vld1q_lane_s16(block + 2, row0, 5);
row0 = vld1q_lane_s16(block + 3, row0, 6);
row0 = vld1q_lane_s16(block + 10, row0, 7);
int16x8_t row1 = vld1q_dup_s16(block + 17);
row1 = vld1q_lane_s16(block + 24, row1, 1);
row1 = vld1q_lane_s16(block + 32, row1, 2);
row1 = vld1q_lane_s16(block + 25, row1, 3);
row1 = vld1q_lane_s16(block + 18, row1, 4);
row1 = vld1q_lane_s16(block + 11, row1, 5);
row1 = vld1q_lane_s16(block + 4, row1, 6);
row1 = vld1q_lane_s16(block + 5, row1, 7);
int16x8_t row2 = vld1q_dup_s16(block + 12);
row2 = vld1q_lane_s16(block + 19, row2, 1);
row2 = vld1q_lane_s16(block + 26, row2, 2);
row2 = vld1q_lane_s16(block + 33, row2, 3);
row2 = vld1q_lane_s16(block + 40, row2, 4);
row2 = vld1q_lane_s16(block + 48, row2, 5);
row2 = vld1q_lane_s16(block + 41, row2, 6);
row2 = vld1q_lane_s16(block + 34, row2, 7);
int16x8_t row3 = vld1q_dup_s16(block + 27);
row3 = vld1q_lane_s16(block + 20, row3, 1);
row3 = vld1q_lane_s16(block + 13, row3, 2);
row3 = vld1q_lane_s16(block + 6, row3, 3);
row3 = vld1q_lane_s16(block + 7, row3, 4);
row3 = vld1q_lane_s16(block + 14, row3, 5);
row3 = vld1q_lane_s16(block + 21, row3, 6);
row3 = vld1q_lane_s16(block + 28, row3, 7);
int16x8_t abs_row0 = vabsq_s16(row0);
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t row0_lz = vclzq_s16(abs_row0);
int16x8_t row1_lz = vclzq_s16(abs_row1);
int16x8_t row2_lz = vclzq_s16(abs_row2);
int16x8_t row3_lz = vclzq_s16(abs_row3);
/* Compute number of bits required to represent each coefficient. */
uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
uint16x8_t row0_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
vnegq_s16(row0_lz));
uint16x8_t row1_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
vnegq_s16(row1_lz));
uint16x8_t row2_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
vnegq_s16(row2_lz));
uint16x8_t row3_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
vnegq_s16(row3_lz));
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
/* Store diff values for rows 0, 1, 2, and 3. */
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
/* Load last four rows of coefficients from DCT block in zig-zag order. */
int16x8_t row4 = vld1q_dup_s16(block + 35);
row4 = vld1q_lane_s16(block + 42, row4, 1);
row4 = vld1q_lane_s16(block + 49, row4, 2);
row4 = vld1q_lane_s16(block + 56, row4, 3);
row4 = vld1q_lane_s16(block + 57, row4, 4);
row4 = vld1q_lane_s16(block + 50, row4, 5);
row4 = vld1q_lane_s16(block + 43, row4, 6);
row4 = vld1q_lane_s16(block + 36, row4, 7);
int16x8_t row5 = vld1q_dup_s16(block + 29);
row5 = vld1q_lane_s16(block + 22, row5, 1);
row5 = vld1q_lane_s16(block + 15, row5, 2);
row5 = vld1q_lane_s16(block + 23, row5, 3);
row5 = vld1q_lane_s16(block + 30, row5, 4);
row5 = vld1q_lane_s16(block + 37, row5, 5);
row5 = vld1q_lane_s16(block + 44, row5, 6);
row5 = vld1q_lane_s16(block + 51, row5, 7);
int16x8_t row6 = vld1q_dup_s16(block + 58);
row6 = vld1q_lane_s16(block + 59, row6, 1);
row6 = vld1q_lane_s16(block + 52, row6, 2);
row6 = vld1q_lane_s16(block + 45, row6, 3);
row6 = vld1q_lane_s16(block + 38, row6, 4);
row6 = vld1q_lane_s16(block + 31, row6, 5);
row6 = vld1q_lane_s16(block + 39, row6, 6);
row6 = vld1q_lane_s16(block + 46, row6, 7);
int16x8_t row7 = vld1q_dup_s16(block + 53);
row7 = vld1q_lane_s16(block + 60, row7, 1);
row7 = vld1q_lane_s16(block + 61, row7, 2);
row7 = vld1q_lane_s16(block + 54, row7, 3);
row7 = vld1q_lane_s16(block + 47, row7, 4);
row7 = vld1q_lane_s16(block + 55, row7, 5);
row7 = vld1q_lane_s16(block + 62, row7, 6);
row7 = vld1q_lane_s16(block + 63, row7, 7);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
int16x8_t row4_lz = vclzq_s16(abs_row4);
int16x8_t row5_lz = vclzq_s16(abs_row5);
int16x8_t row6_lz = vclzq_s16(abs_row6);
int16x8_t row7_lz = vclzq_s16(abs_row7);
/* Compute number of bits required to represent each coefficient. */
uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
uint16x8_t row4_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
vnegq_s16(row4_lz));
uint16x8_t row5_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
vnegq_s16(row5_lz));
uint16x8_t row6_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
vnegq_s16(row6_lz));
uint16x8_t row7_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
vnegq_s16(row7_lz));
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
/* Store diff values for rows 4, 5, 6, and 7. */
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
* means that the corresponding coefficient != 0.
*/
uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x8_t bitmap_mask =
vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
/* Shift left to remove DC bit. */
bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
/* Move bitmap to 32-bit scalar registers. */
uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
/* Set up state and bit buffer for output bitstream. */
working_state *state_ptr = (working_state *)state;
int free_bits = state_ptr->cur.free_bits;
size_t put_buffer = state_ptr->cur.put_buffer;
/* Encode DC coefficient. */
unsigned int nbits = block_nbits[0];
/* Emit Huffman-coded symbol and additional diff bits. */
unsigned int diff = block_diff[0];
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
/* Encode AC coefficients. */
unsigned int r = 0; /* r = run length of zeros */
unsigned int i = 1; /* i = number of coefficients encoded */
/* Code and size information for a run length of 16 zero coefficients */
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
while (bitmap_1_32 != 0) {
r = BUILTIN_CLZ(bitmap_1_32);
i += r;
bitmap_1_32 <<= r;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
bitmap_1_32 <<= 1;
}
r = 33 - i;
i = 33;
while (bitmap_33_63 != 0) {
unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
r += leading_zeros;
i += leading_zeros;
bitmap_33_63 <<= leading_zeros;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
r = 0;
i++;
bitmap_33_63 <<= 1;
}
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
* The value of RS for the EOB code is 0.
*/
if (i != 64) {
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
}
state_ptr->cur.put_buffer = put_buffer;
state_ptr->cur.free_bits = free_bits;
return buffer;
}

View File

@@ -0,0 +1,976 @@
/*
* jsimd_arm.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
* Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
* Copyright (C) 2019, Google LLC.
* Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* For conditions of distribution and use, see copyright notice in jsimdext.inc
*
* This file contains the interface between the "normal" portions
* of the library and the SIMD implementations when running on a
* 32-bit Arm architecture.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include <ctype.h>
static THREAD_LOCAL unsigned int simd_support = ~0;
static THREAD_LOCAL unsigned int simd_huffman = 1;
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
LOCAL(int)
check_feature(char *buffer, char *feature)
{
char *p;
if (*feature == 0)
return 0;
if (strncmp(buffer, "Features", 8) != 0)
return 0;
buffer += 8;
while (isspace(*buffer))
buffer++;
/* Check if 'feature' is present in the buffer as a separate word */
while ((p = strstr(buffer, feature))) {
if (p > buffer && !isspace(*(p - 1))) {
buffer++;
continue;
}
p += strlen(feature);
if (*p != 0 && !isspace(*p)) {
buffer++;
continue;
}
return 1;
}
return 0;
}
LOCAL(int)
parse_proc_cpuinfo(int bufsize)
{
char *buffer = (char *)malloc(bufsize);
FILE *fd;
simd_support = 0;
if (!buffer)
return 0;
fd = fopen("/proc/cpuinfo", "r");
if (fd) {
while (fgets(buffer, bufsize, fd)) {
if (!strchr(buffer, '\n') && !feof(fd)) {
/* "impossible" happened - insufficient size of the buffer! */
fclose(fd);
free(buffer);
return 0;
}
if (check_feature(buffer, "neon"))
simd_support |= JSIMD_NEON;
}
fclose(fd);
}
free(buffer);
return 1;
}
#endif
/*
* Check what SIMD accelerations are supported.
*/
LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
char env[2] = { 0 };
#endif
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
int bufsize = 1024; /* an initial guess for the line buffer size limit */
#endif
if (simd_support != ~0U)
return;
simd_support = 0;
#if defined(__ARM_NEON__)
simd_support |= JSIMD_NEON;
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
/* We still have a chance to use Neon regardless of globally used
* -mcpu/-mfpu options passed to gcc by performing runtime detection via
* /proc/cpuinfo parsing on linux/android */
while (!parse_proc_cpuinfo(bufsize)) {
bufsize *= 2;
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
break;
}
#endif
#ifndef NO_GETENV
/* Force different settings through environment variables */
if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
simd_support = JSIMD_NEON;
if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
simd_support = 0;
if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
simd_huffman = 0;
#endif
}
GLOBAL(int)
jsimd_can_rgb_ycc(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_rgb_gray(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb565(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_extrgb_ycc_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_extrgbx_ycc_convert_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_extbgr_ycc_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_extbgrx_ycc_convert_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_extxbgr_ycc_convert_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_extxrgb_ycc_convert_neon;
break;
default:
neonfct = jsimd_extrgb_ycc_convert_neon;
break;
}
neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_extrgb_gray_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_extrgbx_gray_convert_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_extbgr_gray_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_extbgrx_gray_convert_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_extxbgr_gray_convert_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_extxrgb_gray_convert_neon;
break;
default:
neonfct = jsimd_extrgb_gray_convert_neon;
break;
}
neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_ycc_extrgb_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_ycc_extrgbx_convert_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_ycc_extbgr_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_ycc_extbgrx_convert_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_ycc_extxbgr_convert_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_ycc_extxrgb_convert_neon;
break;
default:
neonfct = jsimd_ycc_extrgb_convert_neon;
break;
}
neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
GLOBAL(void)
jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
output_buf, num_rows);
}
GLOBAL(int)
jsimd_can_h2v2_downsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (DCTSIZE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_downsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (DCTSIZE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor, compptr->width_in_blocks,
input_data, output_data);
}
GLOBAL(void)
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor, compptr->width_in_blocks,
input_data, output_data);
}
GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
GLOBAL(int)
jsimd_can_h2v2_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h1v2_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(void)
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
}
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
}
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(int)
jsimd_can_convsamp(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_convsamp_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM *workspace)
{
jsimd_convsamp_neon(sample_data, start_col, workspace);
}
GLOBAL(void)
jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
FAST_FLOAT *workspace)
{
}
GLOBAL(int)
jsimd_can_fdct_islow(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_fdct_ifast(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_fdct_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_fdct_islow(DCTELEM *data)
{
jsimd_fdct_islow_neon(data);
}
GLOBAL(void)
jsimd_fdct_ifast(DCTELEM *data)
{
jsimd_fdct_ifast_neon(data);
}
GLOBAL(void)
jsimd_fdct_float(FAST_FLOAT *data)
{
}
GLOBAL(int)
jsimd_can_quantize(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_quantize_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
jsimd_quantize_neon(coef_block, divisors, workspace);
}
GLOBAL(void)
jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
FAST_FLOAT *workspace)
{
}
GLOBAL(int)
jsimd_can_idct_2x2(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(ISLOW_MULT_TYPE) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_idct_4x4(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(ISLOW_MULT_TYPE) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
}
GLOBAL(void)
jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
}
GLOBAL(int)
jsimd_can_idct_islow(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(ISLOW_MULT_TYPE) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_idct_ifast(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(IFAST_MULT_TYPE) != 2)
return 0;
if (IFAST_SCALE_BITS != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_idct_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
output_col);
}
GLOBAL(void)
jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
output_col);
}
GLOBAL(void)
jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
}
GLOBAL(int)
jsimd_can_huff_encode_one_block(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (simd_support & JSIMD_NEON && simd_huffman)
return 1;
return 0;
}
GLOBAL(JOCTET *)
jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
int last_dc_val, c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, UJCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, UJCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_neon(block,
jpeg_natural_order_start, Sl,
Al, absvalues, bits);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,316 @@
/*
* jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-neon.c */
/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
* 0.16874695 = 11059 * 2^-16
* 0.33125305 = 21709 * 2^-16
* 0.50000000 = 32768 * 2^-16
* 0.41868592 = 27439 * 2^-16
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
* rounds up or down the result via integer truncation.
*/
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
/* Pointer to RGB(X/A) input data */
JSAMPROW inptr;
/* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
/* Set up conversion constants. */
const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr0 = output_buf[0][output_row];
outptr1 = output_buf[1][output_row];
outptr2 = output_buf[2][output_row];
output_row++;
int cols_remaining = image_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
vshrn_n_u32(cb_lh, 16));
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
vshrn_n_u32(cb_hh, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
/* Increment pointers. */
inptr += (16 * RGB_PIXELSIZE);
outptr0 += 16;
outptr1 += 16;
outptr2 += 16;
}
if (cols_remaining > 8) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 16) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
vshrn_n_u32(cb_lh, 16));
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
vshrn_n_u32(cb_hh, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
} else if (cols_remaining > 0) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 8) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
#if RGB_PIXELSIZE == 4
uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_l = scaled_128_5;
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
uint32x4_t cb_h = scaled_128_5;
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_l = scaled_128_5;
cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
uint32x4_t cr_h = scaled_128_5;
cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
vrshrn_n_u32(y_h, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
vshrn_n_u32(cb_h, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
vshrn_n_u32(cr_h, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1_u8(outptr0, vmovn_u16(y_u16));
vst1_u8(outptr1, vmovn_u16(cb_u16));
vst1_u8(outptr2, vmovn_u16(cr_u16));
}
}
}

View File

@@ -0,0 +1,411 @@
/*
* jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* NOTE: All referenced figures are from
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include "../align.h"
#include "../jchuff.h"
#include "neon-compat.h"
#include <limits.h>
#include <arm_neon.h>
ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
0, 1, 2, 3, 16, 17, 32, 33,
18, 19, 4, 5, 6, 7, 20, 21,
34, 35, 48, 49, 255, 255, 50, 51,
36, 37, 22, 23, 8, 9, 10, 11,
255, 255, 6, 7, 20, 21, 34, 35,
48, 49, 255, 255, 50, 51, 36, 37,
54, 55, 40, 41, 26, 27, 12, 13,
14, 15, 28, 29, 42, 43, 56, 57,
6, 7, 20, 21, 34, 35, 48, 49,
50, 51, 36, 37, 22, 23, 8, 9,
26, 27, 12, 13, 255, 255, 14, 15,
28, 29, 42, 43, 56, 57, 255, 255,
52, 53, 54, 55, 40, 41, 26, 27,
12, 13, 255, 255, 14, 15, 28, 29,
26, 27, 40, 41, 42, 43, 28, 29,
14, 15, 30, 31, 44, 45, 46, 47
};
/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
* address warning because the macro sometimes writes a 64-bit value to a
* non-64-bit-aligned address. That behavior is technically undefined per
* the C specification, but it is supported by the AArch64 architecture and
* compilers.
*/
#if defined(__has_feature)
#if __has_feature(undefined_behavior_sanitizer)
__attribute__((no_sanitize("alignment")))
#endif
#endif
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
uint16_t block_diff[DCTSIZE2];
/* Load lookup table indices for rows of zig-zag ordering. */
#ifdef HAVE_VLD1Q_U8_X4
const uint8x16x4_t idx_rows_0123 =
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
const uint8x16x4_t idx_rows_4567 =
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
#else
/* GCC does not currently support intrinsics vl1dq_<type>_x4(). */
const uint8x16x4_t idx_rows_0123 = { {
vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
} };
const uint8x16x4_t idx_rows_4567 = { {
vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
} };
#endif
/* Load 8x8 block of DCT coefficients. */
#ifdef HAVE_VLD1Q_U8_X4
const int8x16x4_t tbl_rows_0123 =
vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
const int8x16x4_t tbl_rows_4567 =
vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
#else
const int8x16x4_t tbl_rows_0123 = { {
vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
} };
const int8x16x4_t tbl_rows_4567 = { {
vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
} };
#endif
/* Initialise extra lookup tables. */
const int8x16x4_t tbl_rows_2345 = { {
tbl_rows_0123.val[2], tbl_rows_0123.val[3],
tbl_rows_4567.val[0], tbl_rows_4567.val[1]
} };
const int8x16x3_t tbl_rows_567 =
{ { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
/* Shuffle coefficients into zig-zag order. */
int16x8_t row0 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
int16x8_t row1 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
int16x8_t row2 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
int16x8_t row3 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
int16x8_t row4 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
int16x8_t row5 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
int16x8_t row6 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
int16x8_t row7 =
vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
/* Compute DC coefficient difference value (F.1.1.5.1). */
row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
/* Initialize AC coefficient lanes not reachable by lookup tables. */
row1 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
0), row1, 2);
row2 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
4), row2, 0);
row2 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
0), row2, 5);
row5 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
7), row5, 2);
row5 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
3), row5, 7);
row6 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
7), row6, 5);
/* DCT block is now in zig-zag order; start Huffman encoding process. */
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
* means that the corresponding coefficient != 0.
*/
uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
vreinterpretq_u8_u16(row0_ne_0));
uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
vreinterpretq_u8_u16(row2_ne_0));
uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
vreinterpretq_u8_u16(row4_ne_0));
uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
vreinterpretq_u8_u16(row6_ne_0));
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x16_t bitmap_mask =
vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
bitmap_rows_3210);
uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
vget_high_u8(bitmap_rows_76543210));
/* Shift left to remove DC bit. */
bitmap_all =
vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
/* Count bits set (number of non-zero coefficients) in bitmap. */
unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
/* Move bitmap to 64-bit scalar register. */
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Set up state and bit buffer for output bitstream. */
working_state *state_ptr = (working_state *)state;
int free_bits = state_ptr->cur.free_bits;
size_t put_buffer = state_ptr->cur.put_buffer;
/* Encode DC coefficient. */
/* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
int16x8_t abs_row0 = vabsq_s16(row0);
int16x8_t row0_lz = vclzq_s16(abs_row0);
uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
/* Find nbits required to specify sign and amplitude of coefficient. */
unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
unsigned int nbits = 16 - lz;
/* Emit Huffman-coded symbol and additional diff bits. */
unsigned int diff = vgetq_lane_u16(row0_diff, 0);
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
/* Encode AC coefficients. */
unsigned int r = 0; /* r = run length of zeros */
unsigned int i = 1; /* i = number of coefficients encoded */
/* Code and size information for a run length of 16 zero coefficients */
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
/* The most efficient method of computing nbits and diff depends on the
* number of non-zero coefficients. If the bitmap is not too sparse (> 8
* non-zero AC coefficients), it is beneficial to do all of the work using
* Neon; else we do some of the work using Neon and the rest on demand using
* scalar code.
*/
if (non_zero_coefficients > 8) {
uint8_t block_nbits[DCTSIZE2];
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
int16x8_t row1_lz = vclzq_s16(abs_row1);
int16x8_t row2_lz = vclzq_s16(abs_row2);
int16x8_t row3_lz = vclzq_s16(abs_row3);
int16x8_t row4_lz = vclzq_s16(abs_row4);
int16x8_t row5_lz = vclzq_s16(abs_row5);
int16x8_t row6_lz = vclzq_s16(abs_row6);
int16x8_t row7_lz = vclzq_s16(abs_row7);
/* Narrow leading zero count to 8 bits. */
uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
vreinterpretq_u8_s16(row1_lz));
uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
vreinterpretq_u8_s16(row3_lz));
uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
vreinterpretq_u8_s16(row5_lz));
uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
vreinterpretq_u8_s16(row7_lz));
/* Compute nbits needed to specify magnitude of each coefficient. */
uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
/* Store nbits. */
vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
/* Mask bits not required to specify sign and amplitude of diff. */
uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
/* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
row1_mask);
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
row2_mask);
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
row3_mask);
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
row4_mask);
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
row5_mask);
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
row6_mask);
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
row7_mask);
/* Store diff bits. */
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
while (bitmap != 0) {
r = BUILTIN_CLZLL(bitmap);
i += r;
bitmap <<= r;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
bitmap <<= 1;
}
} else if (bitmap != 0) {
uint16_t block_abs[DCTSIZE2];
/* Compute and store absolute value of coefficients. */
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
/* Compute diff bits (without nbits mask) and store. */
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
vcltzq_s16(row1));
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
vcltzq_s16(row2));
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
vcltzq_s16(row3));
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
vcltzq_s16(row4));
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
vcltzq_s16(row5));
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
vcltzq_s16(row6));
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
vcltzq_s16(row7));
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
/* Same as above but must mask diff bits and compute nbits on demand. */
while (bitmap != 0) {
r = BUILTIN_CLZLL(bitmap);
i += r;
bitmap <<= r;
lz = BUILTIN_CLZ(block_abs[i]);
nbits = 32 - lz;
diff = ((unsigned int)block_diff[i] << lz) >> lz;
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
bitmap <<= 1;
}
}
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
* The value of RS for the EOB code is 0.
*/
if (i != 64) {
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
}
state_ptr->cur.put_buffer = put_buffer;
state_ptr->cur.free_bits = free_bits;
return buffer;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,28 @@
/*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* How to obtain memory alignment for structures and variables */
#if defined(_MSC_VER)
#define ALIGN(alignment) __declspec(align(alignment))
#elif defined(__clang__) || defined(__GNUC__)
#define ALIGN(alignment) __attribute__((aligned(alignment)))
#else
#error "Unknown compiler"
#endif

View File

@@ -0,0 +1,160 @@
/*
* jccolor-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
/* RGB -> YCbCr conversion constants */
#define F_0_298 19595
#define F_0_587 38470
#define F_0_113 7471
#define F_0_168 11059
#define F_0_331 21709
#define F_0_500 32768
#define F_0_418 27439
#define F_0_081 5329
ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
F_0_298, F_0_587, F_0_113, F_0_168,
F_0_331, F_0_500, F_0_418, F_0_081
};
/* Include inline routines for colorspace extensions. */
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon

View File

@@ -0,0 +1,120 @@
/*
* jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* RGB -> Grayscale conversion constants */
#define F_0_298 19595
#define F_0_587 38470
#define F_0_113 7471
/* Include inline routines for colorspace extensions. */
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon

View File

@@ -0,0 +1,106 @@
/*
* jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jcgray-neon.c */
/* RGB -> Grayscale conversion is defined by the following equation:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
* These constants are defined in jcgray-neon.c
*
* This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
*/
void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
JSAMPROW inptr;
JSAMPROW outptr;
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr = output_buf[0][output_row];
output_row++;
int cols_remaining = image_width;
for (; cols_remaining > 0; cols_remaining -= 16) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 16) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
if (cols_remaining < 16) {
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
* permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
/* Increment pointers. */
inptr += (16 * RGB_PIXELSIZE);
outptr += 16;
}
}
}

View File

@@ -0,0 +1,131 @@
/*
* jchuff.h
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2009, 2018, 2021, D. R. Commander.
* Copyright (C) 2018, Matthias Räncker.
* Copyright (C) 2020-2021, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
/* Expanded entropy encoder object for Huffman encoding.
*
* The savable_state subrecord contains fields that change within an MCU,
* but must not be updated permanently until we complete the MCU.
*/
#if defined(__aarch64__) || defined(_M_ARM64)
#define BIT_BUF_SIZE 64
#else
#define BIT_BUF_SIZE 32
#endif
typedef struct {
size_t put_buffer; /* current bit accumulation buffer */
int free_bits; /* # of bits available in it */
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
typedef struct {
JOCTET *next_output_byte; /* => next byte to write in buffer */
size_t free_in_buffer; /* # of byte spaces remaining in buffer */
savable_state cur; /* Current bit buffer & DC state */
j_compress_ptr cinfo; /* dump_buffer needs access to this */
int simd;
} working_state;
/* Outputting bits to the file */
/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
* as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
* 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
* speculative 0 byte will be overwritten by the next byte.
*/
#define EMIT_BYTE(b) { \
buffer[0] = (JOCTET)(b); \
buffer[1] = 0; \
buffer -= -2 + ((JOCTET)(b) < 0xFF); \
}
/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
* directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
* encode 0xFF as 0xFF 0x00.
*/
#if defined(__aarch64__) || defined(_M_ARM64)
#define FLUSH() { \
if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
EMIT_BYTE(put_buffer >> 56) \
EMIT_BYTE(put_buffer >> 48) \
EMIT_BYTE(put_buffer >> 40) \
EMIT_BYTE(put_buffer >> 32) \
EMIT_BYTE(put_buffer >> 24) \
EMIT_BYTE(put_buffer >> 16) \
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
*((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
buffer += 8; \
} \
}
#else
#if defined(_MSC_VER) && !defined(__clang__)
#define SPLAT() { \
buffer[0] = (JOCTET)(put_buffer >> 24); \
buffer[1] = (JOCTET)(put_buffer >> 16); \
buffer[2] = (JOCTET)(put_buffer >> 8); \
buffer[3] = (JOCTET)(put_buffer ); \
buffer += 4; \
}
#else
#define SPLAT() { \
put_buffer = __builtin_bswap32(put_buffer); \
__asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
}
#endif
#define FLUSH() { \
if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
EMIT_BYTE(put_buffer >> 24) \
EMIT_BYTE(put_buffer >> 16) \
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
SPLAT(); \
} \
}
#endif
/* Fill the bit buffer to capacity with the leading bits from code, then output
* the bit buffer and put the remaining bits from code into the bit buffer.
*/
#define PUT_AND_FLUSH(code, size) { \
put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
FLUSH() \
free_bits += BIT_BUF_SIZE; \
put_buffer = code; \
}
/* Insert code into the bit buffer and output the bit buffer if needed.
* NOTE: We can't flush with free_bits == 0, since the left shift in
* PUT_AND_FLUSH() would have undefined behavior.
*/
#define PUT_BITS(code, size) { \
free_bits -= size; \
if (free_bits < 0) \
PUT_AND_FLUSH(code, size) \
else \
put_buffer = (put_buffer << size) | code; \
}
#define PUT_CODE(code, size, diff) { \
diff |= code << nbits; \
nbits += size; \
PUT_BITS(diff, nbits) \
}

View File

@@ -0,0 +1,623 @@
/*
* jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
* Copyright (C) 2022, Matthieu Darbois. All Rights Reserved.
* Copyright (C) 2022, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "neon-compat.h"
#include <arm_neon.h>
/* Data preparation for encode_mcu_AC_first().
*
* The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
* found in jcphuff.c.
*/
void jsimd_encode_mcu_AC_first_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
UJCOEF *values, size_t *zerobits)
{
UJCOEF *values_ptr = values;
UJCOEF *diff_values_ptr = values + DCTSIZE2;
/* Rows of coefficients to zero (since they haven't been processed) */
int i, rows_to_zero = 8;
for (i = 0; i < Sl / 16; i++) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
/* Isolate sign of coefficients. */
uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
/* Compute absolute value of coefficients and apply point transform Al. */
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
/* Compute diff values. */
uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
/* Store transformed coefficients and diff values. */
vst1q_u16(values_ptr, abs_coefs1);
vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
vst1q_u16(diff_values_ptr, diff1);
vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
values_ptr += 16;
diff_values_ptr += 16;
jpeg_natural_order_start += 16;
rows_to_zero -= 2;
}
/* Same operation but for remaining partial vector */
int remaining_coefs = Sl % 16;
if (remaining_coefs > 8) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vdupq_n_s16(0);
switch (remaining_coefs) {
case 15:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Isolate sign of coefficients. */
uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
/* Compute absolute value of coefficients and apply point transform Al. */
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
/* Compute diff values. */
uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
/* Store transformed coefficients and diff values. */
vst1q_u16(values_ptr, abs_coefs1);
vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
vst1q_u16(diff_values_ptr, diff1);
vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
values_ptr += 16;
diff_values_ptr += 16;
rows_to_zero -= 2;
} else if (remaining_coefs > 0) {
int16x8_t coefs = vdupq_n_s16(0);
switch (remaining_coefs) {
case 8:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
FALLTHROUGH /*FALLTHROUGH*/
case 7:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Isolate sign of coefficients. */
uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
/* Compute absolute value of coefficients and apply point transform Al. */
uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
/* Compute diff values. */
uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
/* Store transformed coefficients and diff values. */
vst1q_u16(values_ptr, abs_coefs);
vst1q_u16(diff_values_ptr, diff);
values_ptr += 8;
diff_values_ptr += 8;
rows_to_zero--;
}
/* Zero remaining memory in the values and diff_values blocks. */
for (i = 0; i < rows_to_zero; i++) {
vst1q_u16(values_ptr, vdupq_n_u16(0));
vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
values_ptr += 8;
diff_values_ptr += 8;
}
/* Construct zerobits bitmap. A set bit means that the corresponding
* coefficient != 0.
*/
uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
const uint8x8_t bitmap_mask =
vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Store zerobits bitmap. */
*zerobits = ~bitmap;
#else
/* Move bitmap to two 32-bit scalar registers. */
uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Store zerobits bitmap. */
zerobits[0] = ~bitmap0;
zerobits[1] = ~bitmap1;
#endif
}
/* Data preparation for encode_mcu_AC_refine().
*
* The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
* found in jcphuff.c.
*/
int jsimd_encode_mcu_AC_refine_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
UJCOEF *absvalues, size_t *bits)
{
/* Temporary storage buffers for data used to compute the signbits bitmap and
* the end-of-block (EOB) position
*/
uint8_t coef_sign_bits[64];
uint8_t coef_eq1_bits[64];
UJCOEF *absvalues_ptr = absvalues;
uint8_t *coef_sign_bits_ptr = coef_sign_bits;
uint8_t *eq1_bits_ptr = coef_eq1_bits;
/* Rows of coefficients to zero (since they haven't been processed) */
int i, rows_to_zero = 8;
for (i = 0; i < Sl / 16; i++) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
/* Compute and store data for signbits bitmap. */
uint8x8_t sign_coefs1 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
uint8x8_t sign_coefs2 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
vst1_u8(coef_sign_bits_ptr, sign_coefs1);
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
/* Compute absolute value of coefficients and apply point transform Al. */
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
vst1q_u16(absvalues_ptr, abs_coefs1);
vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq11);
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
absvalues_ptr += 16;
coef_sign_bits_ptr += 16;
eq1_bits_ptr += 16;
jpeg_natural_order_start += 16;
rows_to_zero -= 2;
}
/* Same operation but for remaining partial vector */
int remaining_coefs = Sl % 16;
if (remaining_coefs > 8) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vdupq_n_s16(0);
switch (remaining_coefs) {
case 15:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Compute and store data for signbits bitmap. */
uint8x8_t sign_coefs1 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
uint8x8_t sign_coefs2 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
vst1_u8(coef_sign_bits_ptr, sign_coefs1);
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
/* Compute absolute value of coefficients and apply point transform Al. */
uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
vst1q_u16(absvalues_ptr, abs_coefs1);
vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq11);
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
absvalues_ptr += 16;
coef_sign_bits_ptr += 16;
eq1_bits_ptr += 16;
jpeg_natural_order_start += 16;
rows_to_zero -= 2;
} else if (remaining_coefs > 0) {
int16x8_t coefs = vdupq_n_s16(0);
switch (remaining_coefs) {
case 8:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
FALLTHROUGH /*FALLTHROUGH*/
case 7:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Compute and store data for signbits bitmap. */
uint8x8_t sign_coefs =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
vst1_u8(coef_sign_bits_ptr, sign_coefs);
/* Compute absolute value of coefficients and apply point transform Al. */
uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
vst1q_u16(absvalues_ptr, abs_coefs);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq1);
absvalues_ptr += 8;
coef_sign_bits_ptr += 8;
eq1_bits_ptr += 8;
rows_to_zero--;
}
/* Zero remaining memory in blocks. */
for (i = 0; i < rows_to_zero; i++) {
vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
absvalues_ptr += 8;
coef_sign_bits_ptr += 8;
eq1_bits_ptr += 8;
}
/* Construct zerobits bitmap. */
uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
const uint8x8_t bitmap_mask =
vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Store zerobits bitmap. */
bits[0] = ~bitmap;
#else
/* Move bitmap to two 32-bit scalar registers. */
uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Store zerobits bitmap. */
bits[0] = ~bitmap0;
bits[1] = ~bitmap1;
#endif
/* Construct signbits bitmap. */
uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Store signbits bitmap. */
bits[1] = ~bitmap;
#else
/* Move bitmap to two 32-bit scalar registers. */
bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Store signbits bitmap. */
bits[2] = ~bitmap0;
bits[3] = ~bitmap1;
#endif
/* Construct bitmap to find EOB position (the index of the last coefficient
* equal to 1.)
*/
uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Return EOB position. */
if (bitmap == 0) {
/* EOB position is defined to be 0 if all coefficients != 1. */
return 0;
} else {
return 63 - BUILTIN_CLZLL(bitmap);
}
#else
/* Move bitmap to two 32-bit scalar registers. */
bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Return EOB position. */
if (bitmap0 == 0 && bitmap1 == 0) {
return 0;
} else if (bitmap1 != 0) {
return 63 - BUILTIN_CLZ(bitmap1);
} else {
return 31 - BUILTIN_CLZ(bitmap0);
}
#endif
}

View File

@@ -0,0 +1,192 @@
/*
* jcsample-neon.c - downsampling (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/* Downsample pixel values of a single component.
* This version handles the common case of 2:1 horizontal and 1:1 vertical,
* without smoothing.
*/
void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor,
JDIMENSION width_in_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
JSAMPROW inptr, outptr;
/* Load expansion mask to pad remaining elements of last DCT block. */
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
const uint8x16_t expand_mask =
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
/* Load bias pattern (alternating every pixel.) */
/* { 0, 1, 0, 1, 0, 1, 0, 1 } */
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
unsigned i, outrow;
for (outrow = 0; outrow < v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr = input_data[outrow];
/* Downsample all but the last DCT block of pixels. */
for (i = 0; i < width_in_blocks - 1; i++) {
uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
/* Divide total by 2 and narrow to 8-bit. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
/* Store samples to memory. */
vst1_u8(outptr + i * DCTSIZE, samples_u8);
}
/* Load pixels in last DCT block into a table. */
uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Pad the empty elements with the value of the last pixel. */
pixels = vqtbl1q_u8(pixels, expand_mask);
#else
uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
vtbl2_u8(table, vget_high_u8(expand_mask)));
#endif
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
/* Divide total by 2, narrow to 8-bit, and store. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
}
}
/* Downsample pixel values of a single component.
* This version handles the standard case of 2:1 horizontal and 2:1 vertical,
* without smoothing.
*/
void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor,
JDIMENSION width_in_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
JSAMPROW inptr0, inptr1, outptr;
/* Load expansion mask to pad remaining elements of last DCT block. */
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
const uint8x16_t expand_mask =
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
/* Load bias pattern (alternating every pixel.) */
/* { 1, 2, 1, 2, 1, 2, 1, 2 } */
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
unsigned i, outrow;
for (outrow = 0; outrow < v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr0 = input_data[outrow];
inptr1 = input_data[outrow + 1];
/* Downsample all but the last DCT block of pixels. */
for (i = 0; i < width_in_blocks - 1; i++) {
uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
*/
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
/* Divide total by 4 and narrow to 8-bit. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
/* Store samples to memory and increment pointers. */
vst1_u8(outptr + i * DCTSIZE, samples_u8);
}
/* Load pixels in last DCT block into a table. */
uint8x16_t pixels_r0 =
vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
uint8x16_t pixels_r1 =
vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Pad the empty elements with the value of the last pixel. */
pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
#else
uint8x8x2_t table_r0 =
{ { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
uint8x8x2_t table_r1 =
{ { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
#endif
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
/* Divide total by 4, narrow to 8-bit, and store. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
}
}

View File

@@ -0,0 +1,374 @@
/*
* jdcolext-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdcolor-neon.c. */
/* YCbCr -> RGB conversion is defined by the following equations:
* R = Y + 1.40200 * (Cr - 128)
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
* B = Y + 1.77200 * (Cb - 128)
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.3441467 = 11277 * 2^-15
* 0.7141418 = 23401 * 2^-15
* 1.4020386 = 22971 * 2^-14
* 1.7720337 = 29033 * 2^-14
* These constants are defined in jdcolor-neon.c.
*
* To ensure correct results, rounding is used when descaling.
*/
/* Notes on safe memory access for YCbCr -> RGB conversion routines:
*
* Input memory buffers can be safely overread up to the next multiple of
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
* jmemmgr.c.
*
* The output buffer cannot safely be written beyond output_width, since
* output_buf points to a possibly unpadded row in the decompressed image
* buffer allocated by the calling program.
*/
void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
JSAMPROW outptr;
/* Pointers to Y, Cb, and Cr data */
JSAMPROW inptr0, inptr1, inptr2;
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
while (--num_rows >= 0) {
inptr0 = input_buf[0][input_row];
inptr1 = input_buf[1][input_row];
inptr2 = input_buf[2][input_row];
input_row++;
outptr = *output_buf++;
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
uint8x16_t y = vld1q_u8(inptr0);
uint8x16_t cb = vld1q_u8(inptr1);
uint8x16_t cr = vld1q_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_low_u8(cr)));
int16x8_t cr_128_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_high_u8(cr)));
int16x8_t cb_128_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_low_u8(cb)));
int16x8_t cb_128_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_high_u8(cb)));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
consts, 0);
int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
consts, 0);
g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
consts, 1);
g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
consts, 1);
g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
consts, 1);
g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
vrshrn_n_s32(g_sub_y_lh, 15));
int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
vrshrn_n_s32(g_sub_y_hh, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
consts, 2);
int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
consts, 3);
int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
consts, 3);
/* Add Y. */
int16x8_t r_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
vget_low_u8(y)));
int16x8_t r_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
vget_high_u8(y)));
int16x8_t b_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
vget_low_u8(y)));
int16x8_t b_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
vget_high_u8(y)));
int16x8_t g_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
vget_low_u8(y)));
int16x8_t g_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
vget_high_u8(y)));
#if RGB_PIXELSIZE == 4
uint8x16x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
uint8x16x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Store RGB pixel data to memory. */
vst3q_u8(outptr, rgb);
#else
/* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
/* Store RGB pixel data to memory. */
vst1q_u16((uint16_t *)outptr, rgb565_l);
vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
#endif
/* Increment pointers. */
inptr0 += 16;
inptr1 += 16;
inptr2 += 16;
outptr += (RGB_PIXELSIZE * 16);
}
if (cols_remaining >= 8) {
uint8x8_t y = vld1_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
consts, 3);
/* Add Y. */
int16x8_t r =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
int16x8_t b =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
int16x8_t g =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
rgba.val[RGB_GREEN] = vqmovun_s16(g);
rgba.val[RGB_BLUE] = vqmovun_s16(b);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
rgb.val[RGB_GREEN] = vqmovun_s16(g);
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
vst3_u8(outptr, rgb);
#else
/* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
/* Store RGB pixel data to memory. */
vst1q_u16((uint16_t *)outptr, rgb565);
#endif
/* Increment pointers. */
inptr0 += 8;
inptr1 += 8;
inptr2 += 8;
outptr += (RGB_PIXELSIZE * 8);
cols_remaining -= 8;
}
/* Handle the tail elements. */
if (cols_remaining > 0) {
uint8x8_t y = vld1_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
consts, 3);
/* Add Y. */
int16x8_t r =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
int16x8_t b =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
int16x8_t g =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
rgba.val[RGB_GREEN] = vqmovun_s16(g);
rgba.val[RGB_BLUE] = vqmovun_s16(b);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
case 7:
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst4_lane_u8(outptr, rgba, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#elif RGB_PIXELSIZE == 3
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
rgb.val[RGB_GREEN] = vqmovun_s16(g);
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
case 7:
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst3_lane_u8(outptr, rgb, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#else
/* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
/* Store RGB565 pixel data to memory. */
switch (cols_remaining) {
case 7:
vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#endif
}
}
}

View File

@@ -0,0 +1,141 @@
/*
* jdcolor-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* YCbCr -> RGB conversion constants */
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
-F_0_344, F_0_714, F_1_402, F_1_772
};
/* Include inline routines for colorspace extensions. */
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
/* YCbCr -> RGB565 Conversion */
#define RGB_PIXELSIZE 2
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
#include "jdcolext-neon.c"
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon

View File

@@ -0,0 +1,144 @@
/*
* jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* YCbCr -> RGB conversion constants */
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
-F_0_344, F_0_714, F_1_402, F_1_772
};
/* Include inline routines for colorspace extensions. */
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon

View File

@@ -0,0 +1,723 @@
/*
* jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdmerge-neon.c. */
/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
* chroma upsampling and YCbCr -> RGB color conversion into a single function.
*
* As with the standalone functions, YCbCr -> RGB conversion is defined by the
* following equations:
* R = Y + 1.40200 * (Cr - 128)
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
* B = Y + 1.77200 * (Cb - 128)
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.3441467 = 11277 * 2^-15
* 0.7141418 = 23401 * 2^-15
* 1.4020386 = 22971 * 2^-14
* 1.7720337 = 29033 * 2^-14
* These constants are defined in jdmerge-neon.c.
*
* To ensure correct results, rounding is used when descaling.
*/
/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
* routines:
*
* Input memory buffers can be safely overread up to the next multiple of
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
* jmemmgr.c.
*
* The output buffer cannot safely be written beyond output_width, since
* output_buf points to a possibly unpadded row in the decompressed image
* buffer allocated by the calling program.
*/
/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
*/
void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf)
{
JSAMPROW outptr;
/* Pointers to Y, Cb, and Cr data */
JSAMPROW inptr0, inptr1, inptr2;
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
inptr0 = input_buf[0][in_row_group_ctr];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr = output_buf[0];
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
/* De-interleave Y component values into two separate vectors, one
* containing the component values with even-numbered indices and one
* containing the component values with odd-numbered indices.
*/
uint8x8x2_t y = vld2_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
* "odd" Y component values. This effectively upsamples the chroma
* components horizontally.
*/
int16x8_t g_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[0]));
int16x8_t r_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[0]));
int16x8_t b_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[0]));
int16x8_t g_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[1]));
int16x8_t r_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[1]));
int16x8_t b_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
#ifdef RGB_ALPHA
uint8x16x4_t rgba;
rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr, rgba);
#else
uint8x16x3_t rgb;
rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
/* Store RGB pixel data to memory. */
vst3q_u8(outptr, rgb);
#endif
/* Increment pointers. */
inptr0 += 16;
inptr1 += 8;
inptr2 += 8;
outptr += (RGB_PIXELSIZE * 16);
}
if (cols_remaining > 0) {
/* De-interleave Y component values into two separate vectors, one
* containing the component values with even-numbered indices and one
* containing the component values with odd-numbered indices.
*/
uint8x8x2_t y = vld2_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
* "odd" Y component values. This effectively upsamples the chroma
* components horizontally.
*/
int16x8_t g_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[0]));
int16x8_t r_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[0]));
int16x8_t b_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[0]));
int16x8_t g_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[1]));
int16x8_t r_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[1]));
int16x8_t b_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
#ifdef RGB_ALPHA
uint8x8x4_t rgba_h;
rgba_h.val[RGB_RED] = r.val[1];
rgba_h.val[RGB_GREEN] = g.val[1];
rgba_h.val[RGB_BLUE] = b.val[1];
/* Set alpha channel to opaque (0xFF). */
rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
uint8x8x4_t rgba_l;
rgba_l.val[RGB_RED] = r.val[0];
rgba_l.val[RGB_GREEN] = g.val[0];
rgba_l.val[RGB_BLUE] = b.val[0];
/* Set alpha channel to opaque (0xFF). */
rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
case 15:
vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst4_u8(outptr, rgba_l);
break;
case 7:
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst4_lane_u8(outptr, rgba_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#else
uint8x8x3_t rgb_h;
rgb_h.val[RGB_RED] = r.val[1];
rgb_h.val[RGB_GREEN] = g.val[1];
rgb_h.val[RGB_BLUE] = b.val[1];
uint8x8x3_t rgb_l;
rgb_l.val[RGB_RED] = r.val[0];
rgb_l.val[RGB_GREEN] = g.val[0];
rgb_l.val[RGB_BLUE] = b.val[0];
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
case 15:
vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst3_u8(outptr, rgb_l);
break;
case 7:
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst3_lane_u8(outptr, rgb_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#endif
}
}
/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
*
* See comments above for details regarding color conversion and safe memory
* access.
*/
void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf)
{
JSAMPROW outptr0, outptr1;
/* Pointers to Y (both rows), Cb, and Cr data */
JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
inptr0_0 = input_buf[0][in_row_group_ctr * 2];
inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr0 = output_buf[0];
outptr1 = output_buf[1];
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
/* For each row, de-interleave Y component values into two separate
* vectors, one containing the component values with even-numbered indices
* and one containing the component values with odd-numbered indices.
*/
uint8x8x2_t y0 = vld2_u8(inptr0_0);
uint8x8x2_t y1 = vld2_u8(inptr0_1);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
* the "even" and "odd" Y component values. This effectively upsamples the
* chroma components both horizontally and vertically.
*/
int16x8_t g0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[0]));
int16x8_t r0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[0]));
int16x8_t b0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[0]));
int16x8_t g0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[1]));
int16x8_t r0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[1]));
int16x8_t b0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[1]));
int16x8_t g1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[0]));
int16x8_t r1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[0]));
int16x8_t b1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[0]));
int16x8_t g1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[1]));
int16x8_t r1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[1]));
int16x8_t b1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
#ifdef RGB_ALPHA
uint8x16x4_t rgba0, rgba1;
rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
/* Set alpha channel to opaque (0xFF). */
rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr0, rgba0);
vst4q_u8(outptr1, rgba1);
#else
uint8x16x3_t rgb0, rgb1;
rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
/* Store RGB pixel data to memory. */
vst3q_u8(outptr0, rgb0);
vst3q_u8(outptr1, rgb1);
#endif
/* Increment pointers. */
inptr0_0 += 16;
inptr0_1 += 16;
inptr1 += 8;
inptr2 += 8;
outptr0 += (RGB_PIXELSIZE * 16);
outptr1 += (RGB_PIXELSIZE * 16);
}
if (cols_remaining > 0) {
/* For each row, de-interleave Y component values into two separate
* vectors, one containing the component values with even-numbered indices
* and one containing the component values with odd-numbered indices.
*/
uint8x8x2_t y0 = vld2_u8(inptr0_0);
uint8x8x2_t y1 = vld2_u8(inptr0_1);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
* the "even" and "odd" Y component values. This effectively upsamples the
* chroma components both horizontally and vertically.
*/
int16x8_t g0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[0]));
int16x8_t r0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[0]));
int16x8_t b0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[0]));
int16x8_t g0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[1]));
int16x8_t r0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[1]));
int16x8_t b0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[1]));
int16x8_t g1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[0]));
int16x8_t r1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[0]));
int16x8_t b1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[0]));
int16x8_t g1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[1]));
int16x8_t r1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[1]));
int16x8_t b1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
#ifdef RGB_ALPHA
uint8x8x4_t rgba0_h, rgba1_h;
rgba0_h.val[RGB_RED] = r0.val[1];
rgba1_h.val[RGB_RED] = r1.val[1];
rgba0_h.val[RGB_GREEN] = g0.val[1];
rgba1_h.val[RGB_GREEN] = g1.val[1];
rgba0_h.val[RGB_BLUE] = b0.val[1];
rgba1_h.val[RGB_BLUE] = b1.val[1];
/* Set alpha channel to opaque (0xFF). */
rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
uint8x8x4_t rgba0_l, rgba1_l;
rgba0_l.val[RGB_RED] = r0.val[0];
rgba1_l.val[RGB_RED] = r1.val[0];
rgba0_l.val[RGB_GREEN] = g0.val[0];
rgba1_l.val[RGB_GREEN] = g1.val[0];
rgba0_l.val[RGB_BLUE] = b0.val[0];
rgba1_l.val[RGB_BLUE] = b1.val[0];
/* Set alpha channel to opaque (0xFF). */
rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
case 15:
vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst4_u8(outptr0, rgba0_l);
vst4_u8(outptr1, rgba1_l);
break;
case 7:
vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst4_lane_u8(outptr0, rgba0_l, 0);
vst4_lane_u8(outptr1, rgba1_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#else
uint8x8x3_t rgb0_h, rgb1_h;
rgb0_h.val[RGB_RED] = r0.val[1];
rgb1_h.val[RGB_RED] = r1.val[1];
rgb0_h.val[RGB_GREEN] = g0.val[1];
rgb1_h.val[RGB_GREEN] = g1.val[1];
rgb0_h.val[RGB_BLUE] = b0.val[1];
rgb1_h.val[RGB_BLUE] = b1.val[1];
uint8x8x3_t rgb0_l, rgb1_l;
rgb0_l.val[RGB_RED] = r0.val[0];
rgb1_l.val[RGB_RED] = r1.val[0];
rgb0_l.val[RGB_GREEN] = g0.val[0];
rgb1_l.val[RGB_GREEN] = g1.val[0];
rgb0_l.val[RGB_BLUE] = b0.val[0];
rgb1_l.val[RGB_BLUE] = b1.val[0];
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
case 15:
vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst3_u8(outptr0, rgb0_l);
vst3_u8(outptr1, rgb1_l);
break;
case 7:
vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst3_lane_u8(outptr0, rgb0_l, 0);
vst3_lane_u8(outptr1, rgb1_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#endif
}
}

View File

@@ -0,0 +1,569 @@
/*
* jdsample-neon.c - upsampling (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include <arm_neon.h>
/* The diagram below shows a row of samples produced by h2v1 downsampling.
*
* s0 s1 s2
* +---------+---------+---------+
* | | | |
* | p0 p1 | p2 p3 | p4 p5 |
* | | | |
* +---------+---------+---------+
*
* Samples s0-s2 were created by averaging the original pixel component values
* centered at positions p0-p5 above. To approximate those original pixel
* component values, we proportionally blend the adjacent samples in each row.
*
* An upsampled pixel component value is computed by blending the sample
* containing the pixel center with the nearest neighboring sample, in the
* ratio 3:1. For example:
* p1(upsampled) = 3/4 * s0 + 1/4 * s1
* p2(upsampled) = 3/4 * s1 + 1/4 * s0
* When computing the first and last pixel component values in the row, there
* is no adjacent sample to blend, so:
* p0(upsampled) = s0
* p5(upsampled) = s2
*/
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
JDIMENSION downsampled_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
int inrow;
unsigned colctr;
/* Set up constants. */
const uint16x8_t one_u16 = vdupq_n_u16(1);
const uint8x8_t three_u8 = vdup_n_u8(3);
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
inptr = input_data[inrow];
outptr = output_data[inrow];
/* First pixel component value in this row of the original image */
*outptr = (JSAMPLE)GETJSAMPLE(*inptr);
/* 3/4 * containing sample + 1/4 * nearest neighboring sample
* For p1: containing sample = s0, nearest neighboring sample = s1
* For p2: containing sample = s1, nearest neighboring sample = s0
*/
uint8x16_t s0 = vld1q_u8(inptr);
uint8x16_t s1 = vld1q_u8(inptr + 1);
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
* denote low half and high half respectively.
*/
uint16x8_t s1_add_3s0_l =
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
uint16x8_t s1_add_3s0_h =
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
uint16x8_t s0_add_3s1_l =
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
uint16x8_t s0_add_3s1_h =
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
/* Add ordered dithering bias to odd pixel values. */
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
/* The offset is initially 1, because the first pixel component has already
* been stored. However, in subsequent iterations of the SIMD loop, this
* offset is (2 * colctr - 1) to stay within the bounds of the sample
* buffers without having to resort to a slow scalar tail case for the last
* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
* in jmemmgr.c for more details.
*/
unsigned outptr_offset = 1;
uint8x16x2_t output_pixels;
/* We use software pipelining to maximise performance. The code indented
* an extra two spaces begins the next iteration of the loop.
*/
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
s0 = vld1q_u8(inptr + colctr - 1);
s1 = vld1q_u8(inptr + colctr);
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
vrshrn_n_u16(s1_add_3s0_h, 2));
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
vshrn_n_u16(s0_add_3s1_h, 2));
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
* denote low half and high half respectively.
*/
s1_add_3s0_l =
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
s1_add_3s0_h =
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
s0_add_3s1_l =
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
s0_add_3s1_h =
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
/* Add ordered dithering bias to odd pixel values. */
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
/* Store pixel component values to memory. */
vst2q_u8(outptr + outptr_offset, output_pixels);
outptr_offset = 2 * colctr - 1;
}
/* Complete the last iteration of the loop. */
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
vrshrn_n_u16(s1_add_3s0_h, 2));
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
vshrn_n_u16(s0_add_3s1_h, 2));
/* Store pixel component values to memory. */
vst2q_u8(outptr + outptr_offset, output_pixels);
/* Last pixel component value in this row of the original image */
outptr[2 * downsampled_width - 1] =
GETJSAMPLE(inptr[downsampled_width - 1]);
}
}
/* The diagram below shows an array of samples produced by h2v2 downsampling.
*
* s0 s1 s2
* +---------+---------+---------+
* | p0 p1 | p2 p3 | p4 p5 |
* sA | | | |
* | p6 p7 | p8 p9 | p10 p11|
* +---------+---------+---------+
* | p12 p13| p14 p15| p16 p17|
* sB | | | |
* | p18 p19| p20 p21| p22 p23|
* +---------+---------+---------+
* | p24 p25| p26 p27| p28 p29|
* sC | | | |
* | p30 p31| p32 p33| p34 p35|
* +---------+---------+---------+
*
* Samples s0A-s2C were created by averaging the original pixel component
* values centered at positions p0-p35 above. To approximate one of those
* original pixel component values, we proportionally blend the sample
* containing the pixel center with the nearest neighboring samples in each
* row, column, and diagonal.
*
* An upsampled pixel component value is computed by first blending the sample
* containing the pixel center with the nearest neighboring samples in the
* same column, in the ratio 3:1, and then blending each column sum with the
* nearest neighboring column sum, in the ratio 3:1. For example:
* p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
* 1/4 * (3/4 * s0B + 1/4 * s0A)
* = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
* When computing the first and last pixel component values in the row, there
* is no horizontally adjacent sample to blend, so:
* p12(upsampled) = 3/4 * s0B + 1/4 * s0A
* p23(upsampled) = 3/4 * s2B + 1/4 * s2C
* When computing the first and last pixel component values in the column,
* there is no vertically adjacent sample to blend, so:
* p2(upsampled) = 3/4 * s1A + 1/4 * s0A
* p33(upsampled) = 3/4 * s1C + 1/4 * s2C
* When computing the corner pixel component values, there is no adjacent
* sample to blend, so:
* p0(upsampled) = s0A
* p35(upsampled) = s2C
*/
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
JDIMENSION downsampled_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
int inrow, outrow;
unsigned colctr;
/* Set up constants. */
const uint16x8_t seven_u16 = vdupq_n_u16(7);
const uint8x8_t three_u8 = vdup_n_u8(3);
const uint16x8_t three_u16 = vdupq_n_u16(3);
inrow = outrow = 0;
while (outrow < max_v_samp_factor) {
inptr0 = input_data[inrow - 1];
inptr1 = input_data[inrow];
inptr2 = input_data[inrow + 1];
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
* respectively.
*/
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
/* First pixel component value in this row of the original image */
int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
*outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
*outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
/* Step 1: Blend samples vertically in columns s0 and s1.
* Leave the divide by 4 until the end, when it can be done for both
* dimensions at once, right-shifting by 4.
*/
/* Load and compute s0colsum0 and s0colsum1. */
uint8x16_t s0A = vld1q_u8(inptr0);
uint8x16_t s0B = vld1q_u8(inptr1);
uint8x16_t s0C = vld1q_u8(inptr2);
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
* denote low half and high half respectively.
*/
uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
vget_low_u8(s0B), three_u8);
uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
vget_high_u8(s0B), three_u8);
uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
vget_low_u8(s0B), three_u8);
uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
vget_high_u8(s0B), three_u8);
/* Load and compute s1colsum0 and s1colsum1. */
uint8x16_t s1A = vld1q_u8(inptr0 + 1);
uint8x16_t s1B = vld1q_u8(inptr1 + 1);
uint8x16_t s1C = vld1q_u8(inptr2 + 1);
uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
vget_low_u8(s1B), three_u8);
uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
vget_high_u8(s1B), three_u8);
uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
vget_low_u8(s1B), three_u8);
uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
vget_high_u8(s1B), three_u8);
/* Step 2: Blend the already-blended columns. */
uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
/* Add ordered dithering bias to odd pixel values. */
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
uint8x16x2_t output_pixels0 = { {
vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
} };
uint8x16x2_t output_pixels1 = { {
vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
} };
/* Store pixel component values to memory.
* The minimum size of the output buffer for each row is 64 bytes => no
* need to worry about buffer overflow here. See "Creation of 2-D sample
* arrays" in jmemmgr.c for more details.
*/
vst2q_u8(outptr0 + 1, output_pixels0);
vst2q_u8(outptr1 + 1, output_pixels1);
/* The first pixel of the image shifted our loads and stores by one byte.
* We have to re-align on a 32-byte boundary at some point before the end
* of the row (we do it now on the 32/33 pixel boundary) to stay within the
* bounds of the sample buffers without having to resort to a slow scalar
* tail case for the last (downsampled_width % 16) samples. See "Creation
* of 2-D sample arrays" in jmemmgr.c for more details.
*/
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
/* Step 1: Blend samples vertically in columns s0 and s1. */
/* Load and compute s0colsum0 and s0colsum1. */
s0A = vld1q_u8(inptr0 + colctr - 1);
s0B = vld1q_u8(inptr1 + colctr - 1);
s0C = vld1q_u8(inptr2 + colctr - 1);
s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
three_u8);
s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
three_u8);
s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
three_u8);
s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
three_u8);
/* Load and compute s1colsum0 and s1colsum1. */
s1A = vld1q_u8(inptr0 + colctr);
s1B = vld1q_u8(inptr1 + colctr);
s1C = vld1q_u8(inptr2 + colctr);
s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
three_u8);
s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
three_u8);
s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
three_u8);
s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
three_u8);
/* Step 2: Blend the already-blended columns. */
output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
/* Add ordered dithering bias to odd pixel values. */
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
vshrn_n_u16(output0_p1_h, 4));
output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
vrshrn_n_u16(output0_p2_h, 4));
output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
vshrn_n_u16(output1_p1_h, 4));
output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
vrshrn_n_u16(output1_p2_h, 4));
/* Store pixel component values to memory. */
vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
}
/* Last pixel component value in this row of the original image */
int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
GETJSAMPLE(inptr0[downsampled_width - 1]);
outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
GETJSAMPLE(inptr2[downsampled_width - 1]);
outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
inrow++;
}
}
/* The diagram below shows a column of samples produced by h1v2 downsampling
* (or by losslessly rotating or transposing an h2v1-downsampled image.)
*
* +---------+
* | p0 |
* sA | |
* | p1 |
* +---------+
* | p2 |
* sB | |
* | p3 |
* +---------+
* | p4 |
* sC | |
* | p5 |
* +---------+
*
* Samples sA-sC were created by averaging the original pixel component values
* centered at positions p0-p5 above. To approximate those original pixel
* component values, we proportionally blend the adjacent samples in each
* column.
*
* An upsampled pixel component value is computed by blending the sample
* containing the pixel center with the nearest neighboring sample, in the
* ratio 3:1. For example:
* p1(upsampled) = 3/4 * sA + 1/4 * sB
* p2(upsampled) = 3/4 * sB + 1/4 * sA
* When computing the first and last pixel component values in the column,
* there is no adjacent sample to blend, so:
* p0(upsampled) = sA
* p5(upsampled) = sC
*/
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
JDIMENSION downsampled_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
int inrow, outrow;
unsigned colctr;
/* Set up constants. */
const uint16x8_t one_u16 = vdupq_n_u16(1);
const uint8x8_t three_u8 = vdup_n_u8(3);
inrow = outrow = 0;
while (outrow < max_v_samp_factor) {
inptr0 = input_data[inrow - 1];
inptr1 = input_data[inrow];
inptr2 = input_data[inrow + 1];
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
* respectively.
*/
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
inrow++;
/* The size of the input and output buffers is always a multiple of 32
* bytes => no need to worry about buffer overflow when reading/writing
* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
* details.
*/
for (colctr = 0; colctr < downsampled_width; colctr += 16) {
/* Load samples. */
uint8x16_t sA = vld1q_u8(inptr0 + colctr);
uint8x16_t sB = vld1q_u8(inptr1 + colctr);
uint8x16_t sC = vld1q_u8(inptr2 + colctr);
/* Blend samples vertically. */
uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
vget_low_u8(sB), three_u8);
uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
vget_high_u8(sB), three_u8);
uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
vget_low_u8(sB), three_u8);
uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
vget_high_u8(sB), three_u8);
/* Add ordered dithering bias to pixel values in even output rows. */
colsum0_l = vaddq_u16(colsum0_l, one_u16);
colsum0_h = vaddq_u16(colsum0_h, one_u16);
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
vshrn_n_u16(colsum0_h, 2));
uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
vrshrn_n_u16(colsum1_h, 2));
/* Store pixel component values to memory. */
vst1q_u8(outptr0 + colctr, output_pixels0);
vst1q_u8(outptr1 + colctr, output_pixels1);
}
}
}
/* The diagram below shows a row of samples produced by h2v1 downsampling.
*
* s0 s1
* +---------+---------+
* | | |
* | p0 p1 | p2 p3 |
* | | |
* +---------+---------+
*
* Samples s0 and s1 were created by averaging the original pixel component
* values centered at positions p0-p3 above. To approximate those original
* pixel component values, we duplicate the samples horizontally:
* p0(upsampled) = p1(upsampled) = s0
* p2(upsampled) = p3(upsampled) = s1
*/
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
int inrow;
unsigned colctr;
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
inptr = input_data[inrow];
outptr = output_data[inrow];
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
uint8x16_t samples = vld1q_u8(inptr + colctr);
/* Duplicate the samples. The store operation below interleaves them so
* that adjacent pixel component values take on the same sample value,
* per above.
*/
uint8x16x2_t output_pixels = { { samples, samples } };
/* Store pixel component values to memory.
* Due to the way sample buffers are allocated, we don't need to worry
* about tail cases when output_width is not a multiple of 32. See
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
*/
vst2q_u8(outptr + 2 * colctr, output_pixels);
}
}
}
/* The diagram below shows an array of samples produced by h2v2 downsampling.
*
* s0 s1
* +---------+---------+
* | p0 p1 | p2 p3 |
* sA | | |
* | p4 p5 | p6 p7 |
* +---------+---------+
* | p8 p9 | p10 p11|
* sB | | |
* | p12 p13| p14 p15|
* +---------+---------+
*
* Samples s0A-s1B were created by averaging the original pixel component
* values centered at positions p0-p15 above. To approximate those original
* pixel component values, we duplicate the samples both horizontally and
* vertically:
* p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
* p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
* p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
* p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
*/
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr0, outptr1;
int inrow, outrow;
unsigned colctr;
for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
inptr = input_data[inrow];
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
uint8x16_t samples = vld1q_u8(inptr + colctr);
/* Duplicate the samples. The store operation below interleaves them so
* that adjacent pixel component values take on the same sample value,
* per above.
*/
uint8x16x2_t output_pixels = { { samples, samples } };
/* Store pixel component values for both output rows to memory.
* Due to the way sample buffers are allocated, we don't need to worry
* about tail cases when output_width is not a multiple of 32. See
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
*/
vst2q_u8(outptr0 + 2 * colctr, output_pixels);
vst2q_u8(outptr1 + 2 * colctr, output_pixels);
}
}
}

View File

@@ -0,0 +1,214 @@
/*
* jfdctfst-neon.c - fast integer FDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
* (Discrete Cosine Transform) on one block of samples. It uses the same
* calculations and produces exactly the same output as IJG's original
* jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.382683433 = 12544 * 2^-15
* 0.541196100 = 17795 * 2^-15
* 0.707106781 = 23168 * 2^-15
* 0.306562965 = 9984 * 2^-15
*
* See jfdctfst.c for further details of the DCT algorithm. Where possible,
* the variable names and comments here in jsimd_fdct_ifast_neon() match up
* with those in jpeg_fdct_ifast().
*/
#define F_0_382 12544
#define F_0_541 17792
#define F_0_707 23168
#define F_0_306 9984
ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
F_0_382, F_0_541, F_0_707, F_0_306
};
void jsimd_fdct_ifast_neon(DCTELEM *data)
{
/* Load an 8x8 block of samples into Neon registers. De-interleaving loads
* are used, followed by vuzp to transpose the block such that we have a
* column of samples per vector - allowing all rows to be processed at once.
*/
int16x8x4_t data1 = vld4q_s16(data);
int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
int16x8_t col0 = cols_04.val[0];
int16x8_t col1 = cols_15.val[0];
int16x8_t col2 = cols_26.val[0];
int16x8_t col3 = cols_37.val[0];
int16x8_t col4 = cols_04.val[1];
int16x8_t col5 = cols_15.val[1];
int16x8_t col6 = cols_26.val[1];
int16x8_t col7 = cols_37.val[1];
/* Pass 1: process rows. */
/* Load DCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
int16x8_t tmp0 = vaddq_s16(col0, col7);
int16x8_t tmp7 = vsubq_s16(col0, col7);
int16x8_t tmp1 = vaddq_s16(col1, col6);
int16x8_t tmp6 = vsubq_s16(col1, col6);
int16x8_t tmp2 = vaddq_s16(col2, col5);
int16x8_t tmp5 = vsubq_s16(col2, col5);
int16x8_t tmp3 = vaddq_s16(col3, col4);
int16x8_t tmp4 = vsubq_s16(col3, col4);
/* Even part */
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
col4 = vsubq_s16(tmp10, tmp11);
int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
col2 = vaddq_s16(tmp13, z1); /* phase 5 */
col6 = vsubq_s16(tmp13, z1);
/* Odd part */
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
tmp11 = vaddq_s16(tmp5, tmp6);
tmp12 = vaddq_s16(tmp6, tmp7);
int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
z2 = vaddq_s16(z2, z5);
int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
z5 = vaddq_s16(tmp12, z5);
z4 = vaddq_s16(z4, z5);
int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
int16x8_t z13 = vsubq_s16(tmp7, z3);
col5 = vaddq_s16(z13, z2); /* phase 6 */
col3 = vsubq_s16(z13, z2);
col1 = vaddq_s16(z11, z4);
col7 = vsubq_s16(z11, z4);
/* Transpose to work on columns in pass 2. */
int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
vreinterpretq_s32_s16(cols_45.val[0]));
int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
vreinterpretq_s32_s16(cols_45.val[1]));
int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
vreinterpretq_s32_s16(cols_67.val[0]));
int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
vreinterpretq_s32_s16(cols_67.val[1]));
int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
/* Pass 2: process columns. */
tmp0 = vaddq_s16(row0, row7);
tmp7 = vsubq_s16(row0, row7);
tmp1 = vaddq_s16(row1, row6);
tmp6 = vsubq_s16(row1, row6);
tmp2 = vaddq_s16(row2, row5);
tmp5 = vsubq_s16(row2, row5);
tmp3 = vaddq_s16(row3, row4);
tmp4 = vsubq_s16(row3, row4);
/* Even part */
tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
tmp13 = vsubq_s16(tmp0, tmp3);
tmp11 = vaddq_s16(tmp1, tmp2);
tmp12 = vsubq_s16(tmp1, tmp2);
row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
row4 = vsubq_s16(tmp10, tmp11);
z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
row2 = vaddq_s16(tmp13, z1); /* phase 5 */
row6 = vsubq_s16(tmp13, z1);
/* Odd part */
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
tmp11 = vaddq_s16(tmp5, tmp6);
tmp12 = vaddq_s16(tmp6, tmp7);
z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
z2 = vaddq_s16(z2, z5);
z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
z5 = vaddq_s16(tmp12, z5);
z4 = vaddq_s16(z4, z5);
z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
z11 = vaddq_s16(tmp7, z3); /* phase 5 */
z13 = vsubq_s16(tmp7, z3);
row5 = vaddq_s16(z13, z2); /* phase 6 */
row3 = vsubq_s16(z13, z2);
row1 = vaddq_s16(z11, z4);
row7 = vsubq_s16(z11, z4);
vst1q_s16(data + 0 * DCTSIZE, row0);
vst1q_s16(data + 1 * DCTSIZE, row1);
vst1q_s16(data + 2 * DCTSIZE, row2);
vst1q_s16(data + 3 * DCTSIZE, row3);
vst1q_s16(data + 4 * DCTSIZE, row4);
vst1q_s16(data + 5 * DCTSIZE, row5);
vst1q_s16(data + 6 * DCTSIZE, row6);
vst1q_s16(data + 7 * DCTSIZE, row7);
}

View File

@@ -0,0 +1,376 @@
/*
* jfdctint-neon.c - accurate integer FDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
* (Discrete Cosine Transform) on one block of samples. It uses the same
* calculations and produces exactly the same output as IJG's original
* jpeg_fdct_islow() function, which can be found in jfdctint.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.298631336 = 2446 * 2^-13
* 0.390180644 = 3196 * 2^-13
* 0.541196100 = 4433 * 2^-13
* 0.765366865 = 6270 * 2^-13
* 0.899976223 = 7373 * 2^-13
* 1.175875602 = 9633 * 2^-13
* 1.501321110 = 12299 * 2^-13
* 1.847759065 = 15137 * 2^-13
* 1.961570560 = 16069 * 2^-13
* 2.053119869 = 16819 * 2^-13
* 2.562915447 = 20995 * 2^-13
* 3.072711026 = 25172 * 2^-13
*
* See jfdctint.c for further details of the DCT algorithm. Where possible,
* the variable names and comments here in jsimd_fdct_islow_neon() match up
* with those in jpeg_fdct_islow().
*/
#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
#define F_0_298 2446
#define F_0_390 3196
#define F_0_541 4433
#define F_0_765 6270
#define F_0_899 7373
#define F_1_175 9633
#define F_1_501 12299
#define F_1_847 15137
#define F_1_961 16069
#define F_2_053 16819
#define F_2_562 20995
#define F_3_072 25172
ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
F_0_298, -F_0_390, F_0_541, F_0_765,
-F_0_899, F_1_175, F_1_501, -F_1_847,
-F_1_961, F_2_053, -F_2_562, F_3_072
};
void jsimd_fdct_islow_neon(DCTELEM *data)
{
/* Load DCT constants. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Load an 8x8 block of samples into Neon registers. De-interleaving loads
* are used, followed by vuzp to transpose the block such that we have a
* column of samples per vector - allowing all rows to be processed at once.
*/
int16x8x4_t s_rows_0123 = vld4q_s16(data);
int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
int16x8_t col0 = cols_04.val[0];
int16x8_t col1 = cols_15.val[0];
int16x8_t col2 = cols_26.val[0];
int16x8_t col3 = cols_37.val[0];
int16x8_t col4 = cols_04.val[1];
int16x8_t col5 = cols_15.val[1];
int16x8_t col6 = cols_26.val[1];
int16x8_t col7 = cols_37.val[1];
/* Pass 1: process rows. */
int16x8_t tmp0 = vaddq_s16(col0, col7);
int16x8_t tmp7 = vsubq_s16(col0, col7);
int16x8_t tmp1 = vaddq_s16(col1, col6);
int16x8_t tmp6 = vsubq_s16(col1, col6);
int16x8_t tmp2 = vaddq_s16(col2, col5);
int16x8_t tmp5 = vsubq_s16(col2, col5);
int16x8_t tmp3 = vaddq_s16(col3, col4);
int16x8_t tmp4 = vsubq_s16(col3, col4);
/* Even part */
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
int32x4_t z1_l =
vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
int32x4_t z1_h =
vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
int32x4_t col2_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
int32x4_t col2_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
int32x4_t col6_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
int32x4_t col6_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
/* Odd part */
int16x8_t z1 = vaddq_s16(tmp4, tmp7);
int16x8_t z2 = vaddq_s16(tmp5, tmp6);
int16x8_t z3 = vaddq_s16(tmp4, tmp6);
int16x8_t z4 = vaddq_s16(tmp5, tmp7);
/* sqrt(2) * c3 */
int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
/* sqrt(2) * (-c1+c3+c5-c7) */
int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
/* sqrt(2) * ( c1+c3-c5+c7) */
int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
/* sqrt(2) * ( c1+c3+c5-c7) */
int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
/* sqrt(2) * ( c1+c3-c5-c7) */
int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
/* sqrt(2) * (c7-c3) */
z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
/* sqrt(2) * (-c1-c3) */
int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
/* sqrt(2) * (-c3-c5) */
int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
/* sqrt(2) * (c5-c3) */
int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
z3_l = vaddq_s32(z3_l, z5_l);
z3_h = vaddq_s32(z3_h, z5_h);
z4_l = vaddq_s32(z4_l, z5_l);
z4_h = vaddq_s32(z4_h, z5_h);
tmp4_l = vaddq_s32(tmp4_l, z1_l);
tmp4_h = vaddq_s32(tmp4_h, z1_h);
tmp4_l = vaddq_s32(tmp4_l, z3_l);
tmp4_h = vaddq_s32(tmp4_h, z3_h);
col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
vrshrn_n_s32(tmp4_h, DESCALE_P1));
tmp5_l = vaddq_s32(tmp5_l, z2_l);
tmp5_h = vaddq_s32(tmp5_h, z2_h);
tmp5_l = vaddq_s32(tmp5_l, z4_l);
tmp5_h = vaddq_s32(tmp5_h, z4_h);
col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
vrshrn_n_s32(tmp5_h, DESCALE_P1));
tmp6_l = vaddq_s32(tmp6_l, z2_l);
tmp6_h = vaddq_s32(tmp6_h, z2_h);
tmp6_l = vaddq_s32(tmp6_l, z3_l);
tmp6_h = vaddq_s32(tmp6_h, z3_h);
col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
vrshrn_n_s32(tmp6_h, DESCALE_P1));
tmp7_l = vaddq_s32(tmp7_l, z1_l);
tmp7_h = vaddq_s32(tmp7_h, z1_h);
tmp7_l = vaddq_s32(tmp7_l, z4_l);
tmp7_h = vaddq_s32(tmp7_h, z4_h);
col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
vrshrn_n_s32(tmp7_h, DESCALE_P1));
/* Transpose to work on columns in pass 2. */
int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
vreinterpretq_s32_s16(cols_45.val[0]));
int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
vreinterpretq_s32_s16(cols_45.val[1]));
int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
vreinterpretq_s32_s16(cols_67.val[0]));
int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
vreinterpretq_s32_s16(cols_67.val[1]));
int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
/* Pass 2: process columns. */
tmp0 = vaddq_s16(row0, row7);
tmp7 = vsubq_s16(row0, row7);
tmp1 = vaddq_s16(row1, row6);
tmp6 = vsubq_s16(row1, row6);
tmp2 = vaddq_s16(row2, row5);
tmp5 = vsubq_s16(row2, row5);
tmp3 = vaddq_s16(row3, row4);
tmp4 = vsubq_s16(row3, row4);
/* Even part */
tmp10 = vaddq_s16(tmp0, tmp3);
tmp13 = vsubq_s16(tmp0, tmp3);
tmp11 = vaddq_s16(tmp1, tmp2);
tmp12 = vsubq_s16(tmp1, tmp2);
row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
int32x4_t row2_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
int32x4_t row2_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
int32x4_t row6_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
int32x4_t row6_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
/* Odd part */
z1 = vaddq_s16(tmp4, tmp7);
z2 = vaddq_s16(tmp5, tmp6);
z3 = vaddq_s16(tmp4, tmp6);
z4 = vaddq_s16(tmp5, tmp7);
/* sqrt(2) * c3 */
z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
/* sqrt(2) * (-c1+c3+c5-c7) */
tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
/* sqrt(2) * ( c1+c3-c5+c7) */
tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
/* sqrt(2) * ( c1+c3+c5-c7) */
tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
/* sqrt(2) * ( c1+c3-c5-c7) */
tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
/* sqrt(2) * (c7-c3) */
z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
/* sqrt(2) * (-c1-c3) */
z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
/* sqrt(2) * (-c3-c5) */
z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
/* sqrt(2) * (c5-c3) */
z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
z3_l = vaddq_s32(z3_l, z5_l);
z3_h = vaddq_s32(z3_h, z5_h);
z4_l = vaddq_s32(z4_l, z5_l);
z4_h = vaddq_s32(z4_h, z5_h);
tmp4_l = vaddq_s32(tmp4_l, z1_l);
tmp4_h = vaddq_s32(tmp4_h, z1_h);
tmp4_l = vaddq_s32(tmp4_l, z3_l);
tmp4_h = vaddq_s32(tmp4_h, z3_h);
row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
vrshrn_n_s32(tmp4_h, DESCALE_P2));
tmp5_l = vaddq_s32(tmp5_l, z2_l);
tmp5_h = vaddq_s32(tmp5_h, z2_h);
tmp5_l = vaddq_s32(tmp5_l, z4_l);
tmp5_h = vaddq_s32(tmp5_h, z4_h);
row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
vrshrn_n_s32(tmp5_h, DESCALE_P2));
tmp6_l = vaddq_s32(tmp6_l, z2_l);
tmp6_h = vaddq_s32(tmp6_h, z2_h);
tmp6_l = vaddq_s32(tmp6_l, z3_l);
tmp6_h = vaddq_s32(tmp6_h, z3_h);
row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
vrshrn_n_s32(tmp6_h, DESCALE_P2));
tmp7_l = vaddq_s32(tmp7_l, z1_l);
tmp7_h = vaddq_s32(tmp7_h, z1_h);
tmp7_l = vaddq_s32(tmp7_l, z4_l);
tmp7_h = vaddq_s32(tmp7_h, z4_h);
row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
vrshrn_n_s32(tmp7_h, DESCALE_P2));
vst1q_s16(data + 0 * DCTSIZE, row0);
vst1q_s16(data + 1 * DCTSIZE, row1);
vst1q_s16(data + 2 * DCTSIZE, row2);
vst1q_s16(data + 3 * DCTSIZE, row3);
vst1q_s16(data + 4 * DCTSIZE, row4);
vst1q_s16(data + 5 * DCTSIZE, row5);
vst1q_s16(data + 6 * DCTSIZE, row6);
vst1q_s16(data + 7 * DCTSIZE, row7);
}

View File

@@ -0,0 +1,472 @@
/*
* jidctfst-neon.c - fast integer IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
* inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
* uses the same calculations and produces exactly the same output as IJG's
* original jpeg_idct_ifast() function, which can be found in jidctfst.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.082392200 = 2688 * 2^-15
* 0.414213562 = 13568 * 2^-15
* 0.847759065 = 27776 * 2^-15
* 0.613125930 = 20096 * 2^-15
*
* See jidctfst.c for further details of the IDCT algorithm. Where possible,
* the variable names and comments here in jsimd_idct_ifast_neon() match up
* with those in jpeg_idct_ifast().
*/
#define PASS1_BITS 2
#define F_0_082 2688
#define F_0_414 13568
#define F_0_847 27776
#define F_0_613 20096
ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
F_0_082, F_0_414, F_0_847, F_0_613
};
void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
IFAST_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values for DC coefficients. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
/* Dequantize DC coefficients. */
row0 = vmulq_s16(row0, quant_row0);
/* Construct bitmap to test if all AC coefficients are 0. */
int16x8_t bitmap = vorrq_s16(row1, row2);
bitmap = vorrq_s16(bitmap, row3);
bitmap = vorrq_s16(bitmap, row4);
bitmap = vorrq_s16(bitmap, row5);
bitmap = vorrq_s16(bitmap, row6);
bitmap = vorrq_s16(bitmap, row7);
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
/* Load IDCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
/* All AC coefficients are zero.
* Compute DC values and duplicate into vectors.
*/
int16x8_t dcval = row0;
row1 = dcval;
row2 = dcval;
row3 = dcval;
row4 = dcval;
row5 = dcval;
row6 = dcval;
row7 = dcval;
} else if (left_ac_bitmap == 0) {
/* AC coefficients are zero for columns 0, 1, 2, and 3.
* Use DC values for these columns.
*/
int16x4_t dcval = vget_low_s16(row0);
/* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Even part: dequantize DCT coefficients. */
int16x4_t tmp0 = vget_high_s16(row0);
int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsub_s16(tmp10, tmp13);
tmp1 = vadd_s16(tmp11, tmp12);
tmp2 = vsub_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
int16x4_t z11 = vadd_s16(tmp4, tmp7);
int16x4_t z12 = vsub_s16(tmp4, tmp7);
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsub_s16(tmp11, tmp6);
tmp4 = vadd_s16(tmp10, tmp5);
row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
} else if (right_ac_bitmap == 0) {
/* AC coefficients are zero for columns 4, 5, 6, and 7.
* Use DC values for these columns.
*/
int16x4_t dcval = vget_high_s16(row0);
/* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Even part: dequantize DCT coefficients. */
int16x4_t tmp0 = vget_low_s16(row0);
int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsub_s16(tmp10, tmp13);
tmp1 = vadd_s16(tmp11, tmp12);
tmp2 = vsub_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
int16x4_t z11 = vadd_s16(tmp4, tmp7);
int16x4_t z12 = vsub_s16(tmp4, tmp7);
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsub_s16(tmp11, tmp6);
tmp4 = vadd_s16(tmp10, tmp5);
row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
} else {
/* Some AC coefficients are non-zero; full IDCT calculation required. */
/* Load quantization table. */
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Even part: dequantize DCT coefficients. */
int16x8_t tmp0 = row0;
int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsubq_s16(tmp12, tmp13);
tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsubq_s16(tmp10, tmp13);
tmp1 = vaddq_s16(tmp11, tmp12);
tmp2 = vsubq_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
int16x8_t z11 = vaddq_s16(tmp4, tmp7);
int16x8_t z12 = vsubq_s16(tmp4, tmp7);
tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsubq_s16(tmp11, tmp6);
tmp4 = vaddq_s16(tmp10, tmp5);
row0 = vaddq_s16(tmp0, tmp7);
row7 = vsubq_s16(tmp0, tmp7);
row1 = vaddq_s16(tmp1, tmp6);
row6 = vsubq_s16(tmp1, tmp6);
row2 = vaddq_s16(tmp2, tmp5);
row5 = vsubq_s16(tmp2, tmp5);
row4 = vaddq_s16(tmp3, tmp4);
row3 = vsubq_s16(tmp3, tmp4);
}
/* Transpose rows to work on columns in pass 2. */
int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
vreinterpretq_s32_s16(rows_45.val[0]));
int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
vreinterpretq_s32_s16(rows_45.val[1]));
int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
vreinterpretq_s32_s16(rows_67.val[0]));
int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
vreinterpretq_s32_s16(rows_67.val[1]));
int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
/* 1-D IDCT, pass 2 */
/* Even part */
int16x8_t tmp10 = vaddq_s16(col0, col4);
int16x8_t tmp11 = vsubq_s16(col0, col4);
int16x8_t tmp13 = vaddq_s16(col2, col6);
int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
tmp12 = vaddq_s16(tmp12, col2_sub_col6);
tmp12 = vsubq_s16(tmp12, tmp13);
int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
/* Odd part */
int16x8_t z13 = vaddq_s16(col5, col3);
int16x8_t neg_z10 = vsubq_s16(col3, col5);
int16x8_t z11 = vaddq_s16(col1, col7);
int16x8_t z12 = vsubq_s16(col1, col7);
int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
col0 = vaddq_s16(tmp0, tmp7);
col7 = vsubq_s16(tmp0, tmp7);
col1 = vaddq_s16(tmp1, tmp6);
col6 = vsubq_s16(tmp1, tmp6);
col2 = vaddq_s16(tmp2, tmp5);
col5 = vsubq_s16(tmp2, tmp5);
col4 = vaddq_s16(tmp3, tmp4);
col3 = vsubq_s16(tmp3, tmp4);
/* Scale down by a factor of 8, narrowing to 8-bit. */
int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
vqshrn_n_s16(col1, PASS1_BITS + 3));
int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
vqshrn_n_s16(col5, PASS1_BITS + 3));
int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
vqshrn_n_s16(col3, PASS1_BITS + 3));
int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
vqshrn_n_s16(col7, PASS1_BITS + 3));
/* Clamp to range [0-255]. */
uint8x16_t cols_01 =
vreinterpretq_u8_s8
(vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_45 =
vreinterpretq_u8_s8
(vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_23 =
vreinterpretq_u8_s8
(vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_67 =
vreinterpretq_u8_s8
(vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
/* Transpose block to prepare for store. */
uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
vreinterpretq_u32_u8(cols_45));
uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
vreinterpretq_u32_u8(cols_67));
uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
vreinterpretq_u8_u32(cols_0415.val[1]));
uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
vreinterpretq_u8_u32(cols_2637.val[1]));
uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
vreinterpretq_u16_u8(cols_2367.val[0]));
uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
vreinterpretq_u16_u8(cols_2367.val[1]));
uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
JSAMPROW outptr0 = output_buf[0] + output_col;
JSAMPROW outptr1 = output_buf[1] + output_col;
JSAMPROW outptr2 = output_buf[2] + output_col;
JSAMPROW outptr3 = output_buf[3] + output_col;
JSAMPROW outptr4 = output_buf[4] + output_col;
JSAMPROW outptr5 = output_buf[5] + output_col;
JSAMPROW outptr6 = output_buf[6] + output_col;
JSAMPROW outptr7 = output_buf[7] + output_col;
/* Store DCT block to memory. */
vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
}

View File

@@ -0,0 +1,801 @@
/*
* jidctint-neon.c - accurate integer IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
/* The computation of the inverse DCT requires the use of constants known at
* compile time. Scaled integer constants are used to avoid floating-point
* arithmetic:
* 0.298631336 = 2446 * 2^-13
* 0.390180644 = 3196 * 2^-13
* 0.541196100 = 4433 * 2^-13
* 0.765366865 = 6270 * 2^-13
* 0.899976223 = 7373 * 2^-13
* 1.175875602 = 9633 * 2^-13
* 1.501321110 = 12299 * 2^-13
* 1.847759065 = 15137 * 2^-13
* 1.961570560 = 16069 * 2^-13
* 2.053119869 = 16819 * 2^-13
* 2.562915447 = 20995 * 2^-13
* 3.072711026 = 25172 * 2^-13
*/
#define F_0_298 2446
#define F_0_390 3196
#define F_0_541 4433
#define F_0_765 6270
#define F_0_899 7373
#define F_1_175 9633
#define F_1_501 12299
#define F_1_847 15137
#define F_1_961 16069
#define F_2_053 16819
#define F_2_562 20995
#define F_3_072 25172
#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
F_0_899, F_0_541,
F_2_562, F_0_298_MINUS_0_899,
F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
F_0_541_PLUS_0_765, F_1_175,
F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
0, 0, 0, 0
};
/* Forward declaration of regular and sparse IDCT helper functions */
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t row4,
int16x4_t row5,
int16x4_t row6,
int16x4_t row7,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16x4_t quant_row4,
int16x4_t quant_row5,
int16x4_t quant_row6,
int16x4_t quant_row7,
int16_t *workspace_1,
int16_t *workspace_2);
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16_t *workspace_1,
int16_t *workspace_2);
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
/* Perform dequantization and inverse DCT on one block of coefficients. For
* reference, the C implementation (jpeg_idct_slow()) can be found in
* jidctint.c.
*
* Optimization techniques used for fast data access:
*
* In each pass, the inverse DCT is computed for the left and right 4x8 halves
* of the DCT block. This avoids spilling due to register pressure, and the
* increased granularity allows for an optimized calculation depending on the
* values of the DCT coefficients. Between passes, intermediate data is stored
* in 4x8 workspace buffers.
*
* Transposing the 8x8 DCT block after each pass can be achieved by transposing
* each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
* diagram below.) Swapping quadrants is cheap, since the second pass can just
* swap the workspace buffer pointers.
*
* +-------+-------+ +-------+-------+
* | | | | | |
* | 0 | 1 | | 0 | 2 |
* | | | transpose | | |
* +-------+-------+ ------> +-------+-------+
* | | | | | |
* | 2 | 3 | | 1 | 3 |
* | | | | | |
* +-------+-------+ +-------+-------+
*
* Optimization techniques used to accelerate the inverse DCT calculation:
*
* In a DCT coefficient block, the coefficients are increasingly likely to be 0
* as you move diagonally from top left to bottom right. If whole rows of
* coefficients are 0, then the inverse DCT calculation can be simplified. On
* the first pass of the inverse DCT, we test for three special cases before
* defaulting to a full "regular" inverse DCT:
*
* 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
* "sparse" simplified inverse DCT on rows 0-3.
* 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
* result is equal to the dequantized DC coefficients.
* 3) AC and DC coefficients are all zero. In this case, the inverse DCT
* result is all zero. For the left 4x8 half, this is handled identically
* to Case 2 above. For the right 4x8 half, we do no work and signal that
* the "sparse" algorithm is required for the second pass.
*
* In the second pass, only a single special case is tested: whether the AC and
* DC coefficients were all zero in the right 4x8 block during the first pass
* (refer to Case 3 above.) If this is the case, then a "sparse" variant of
* the second pass is performed for both the left and right halves of the DCT
* block. (The transposition after the first pass means that the right 4x8
* block during the first pass becomes rows 4-7 during the second pass.)
*/
void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
int16_t workspace_l[8 * DCTSIZE / 2];
int16_t workspace_r[8 * DCTSIZE / 2];
/* Compute IDCT first pass on left 4x8 coefficient block. */
/* Load DCT coefficients in left 4x8 block. */
int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table for left 4x8 block. */
int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
int16x4_t bitmap = vorr_s16(row7, row6);
bitmap = vorr_s16(bitmap, row5);
bitmap = vorr_s16(bitmap, row4);
int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
if (bitmap_rows_4567 == 0) {
bitmap = vorr_s16(bitmap, row3);
bitmap = vorr_s16(bitmap, row2);
bitmap = vorr_s16(bitmap, row1);
int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
if (left_ac_bitmap == 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
/* Store 4x4 blocks to workspace, transposing in the process. */
vst4_s16(workspace_l, quadrant);
vst4_s16(workspace_r, quadrant);
} else {
jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
quant_row1, quant_row2, quant_row3,
workspace_l, workspace_r);
}
} else {
jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
row6, row7, quant_row0, quant_row1,
quant_row2, quant_row3, quant_row4,
quant_row5, quant_row6, quant_row7,
workspace_l, workspace_r);
}
/* Compute IDCT first pass on right 4x8 coefficient block. */
/* Load DCT coefficients in right 4x8 block. */
row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
/* Load quantization table for right 4x8 block. */
quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
bitmap = vorr_s16(row7, row6);
bitmap = vorr_s16(bitmap, row5);
bitmap = vorr_s16(bitmap, row4);
bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
bitmap = vorr_s16(bitmap, row3);
bitmap = vorr_s16(bitmap, row2);
bitmap = vorr_s16(bitmap, row1);
int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
/* If this remains non-zero, a "regular" second pass will be performed. */
int64_t right_ac_dc_bitmap = 1;
if (right_ac_bitmap == 0) {
bitmap = vorr_s16(bitmap, row0);
right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
if (right_ac_dc_bitmap != 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
/* Store 4x4 blocks to workspace, transposing in the process. */
vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
}
} else {
if (bitmap_rows_4567 == 0) {
jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
quant_row1, quant_row2, quant_row3,
workspace_l + 4 * DCTSIZE / 2,
workspace_r + 4 * DCTSIZE / 2);
} else {
jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
row6, row7, quant_row0, quant_row1,
quant_row2, quant_row3, quant_row4,
quant_row5, quant_row6, quant_row7,
workspace_l + 4 * DCTSIZE / 2,
workspace_r + 4 * DCTSIZE / 2);
}
}
/* Second pass: compute IDCT on rows in workspace. */
/* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
if (right_ac_dc_bitmap == 0) {
jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
} else {
jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
}
}
/* Perform dequantization and the first pass of the accurate inverse DCT on a
* 4x8 block of coefficients. (To process the full 8x8 DCT block, this
* function-- or some other optimized variant-- needs to be called for both the
* left and right 4x8 blocks.)
*
* This "regular" version assumes that no optimization can be made to the IDCT
* calculation, since no useful set of AC coefficients is all 0.
*
* The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
* found in jidctint.c. Algorithmic changes made here are documented inline.
*/
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t row4,
int16x4_t row5,
int16x4_t row6,
int16x4_t row7,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16x4_t quant_row4,
int16x4_t quant_row5,
int16x4_t quant_row6,
int16x4_t quant_row7,
int16_t *workspace_1,
int16_t *workspace_2)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part */
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
z2_s16 = vmul_s16(row0, quant_row0);
z3_s16 = vmul_s16(row4, quant_row4);
int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part */
int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z5 = (z3 + z4) * 1.175875602;
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
* z3 += z5; z4 += z5;
*
* This implementation:
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
*/
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
* tmp0 += z1 + z3; tmp1 += z2 + z4;
* tmp2 += z2 + z3; tmp3 += z1 + z4;
*
* This implementation:
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
* tmp0 += z3; tmp1 += z4;
* tmp2 += z3; tmp3 += z4;
*/
tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
tmp0 = vaddq_s32(tmp0, z3);
tmp1 = vaddq_s32(tmp1, z4);
tmp2 = vaddq_s32(tmp2, z3);
tmp3 = vaddq_s32(tmp3, z4);
/* Final output stage: descale and narrow to 16-bit. */
int16x4x4_t rows_0123 = { {
vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
} };
int16x4x4_t rows_4567 = { {
vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
} };
/* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
* (VST4 transposes the blocks. We need to operate on rows in the next
* pass.)
*/
vst4_s16(workspace_1, rows_0123);
vst4_s16(workspace_2, rows_4567);
}
/* Perform dequantization and the first pass of the accurate inverse DCT on a
* 4x8 block of coefficients.
*
* This "sparse" version assumes that the AC coefficients in rows 4-7 are all
* 0. This simplifies the IDCT calculation, accelerating overall performance.
*/
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16_t *workspace_1,
int16_t *workspace_2)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part (z3 is all 0) */
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
z2_s16 = vmul_s16(row0, quant_row0);
int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part (tmp0 and tmp1 are both all 0) */
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
int16x4_t z3_s16 = tmp2_s16;
int16x4_t z4_s16 = tmp3_s16;
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
/* Final output stage: descale and narrow to 16-bit. */
int16x4x4_t rows_0123 = { {
vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
} };
int16x4x4_t rows_4567 = { {
vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
} };
/* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
* (VST4 transposes the blocks. We need to operate on rows in the next
* pass.)
*/
vst4_s16(workspace_1, rows_0123);
vst4_s16(workspace_2, rows_4567);
}
/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
* coefficients. (To process the full 8x8 DCT block, this function-- or some
* other optimized variant-- needs to be called for both the right and left 4x8
* blocks.)
*
* This "regular" version assumes that no optimization can be made to the IDCT
* calculation, since no useful set of coefficient values are all 0 after the
* first pass.
*
* Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
* can be found in jidctint.c. Algorithmic changes made here are documented
* inline.
*/
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part */
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part */
int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z5 = (z3 + z4) * 1.175875602;
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
* z3 += z5; z4 += z5;
*
* This implementation:
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
*/
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
* tmp0 += z1 + z3; tmp1 += z2 + z4;
* tmp2 += z2 + z3; tmp3 += z1 + z4;
*
* This implementation:
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
* tmp0 += z3; tmp1 += z4;
* tmp2 += z3; tmp3 += z4;
*/
tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
tmp0 = vaddq_s32(tmp0, z3);
tmp1 = vaddq_s32(tmp1, z4);
tmp2 = vaddq_s32(tmp2, z3);
tmp3 = vaddq_s32(tmp3, z4);
/* Final output stage: descale and narrow to 16-bit. */
int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
vaddhn_s32(tmp12, tmp1));
int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
vaddhn_s32(tmp13, tmp0));
int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
vsubhn_s32(tmp11, tmp2));
int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
vsubhn_s32(tmp10, tmp3));
/* Descale and narrow to 8-bit. */
int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
/* Clamp to range [0-255]. */
uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
vdup_n_u8(CENTERJSAMPLE));
/* Transpose 4x8 block and store to memory. (Zipping adjacent columns
* together allows us to store 16-bit elements.)
*/
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
uint16x4x4_t cols_01_23_45_67 = { {
vreinterpret_u16_u8(cols_01_23.val[0]),
vreinterpret_u16_u8(cols_01_23.val[1]),
vreinterpret_u16_u8(cols_45_67.val[0]),
vreinterpret_u16_u8(cols_45_67.val[1])
} };
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
/* VST4 of 16-bit elements completes the transpose. */
vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
}
/* Performs the second pass of the accurate inverse DCT on a 4x8 block
* of coefficients.
*
* This "sparse" version assumes that the coefficient values (after the first
* pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
* accelerating overall performance.
*/
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part (z3 is all 0) */
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part (tmp0 and tmp1 are both all 0) */
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
int16x4_t z3_s16 = tmp2_s16;
int16x4_t z4_s16 = tmp3_s16;
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
/* Final output stage: descale and narrow to 16-bit. */
int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
vaddhn_s32(tmp12, tmp1));
int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
vaddhn_s32(tmp13, tmp0));
int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
vsubhn_s32(tmp11, tmp2));
int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
vsubhn_s32(tmp10, tmp3));
/* Descale and narrow to 8-bit. */
int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
/* Clamp to range [0-255]. */
uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
vdup_n_u8(CENTERJSAMPLE));
/* Transpose 4x8 block and store to memory. (Zipping adjacent columns
* together allows us to store 16-bit elements.)
*/
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
uint16x4x4_t cols_01_23_45_67 = { {
vreinterpret_u16_u8(cols_01_23.val[0]),
vreinterpret_u16_u8(cols_01_23.val[1]),
vreinterpret_u16_u8(cols_45_67.val[0]),
vreinterpret_u16_u8(cols_45_67.val[1])
} };
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
/* VST4 of 16-bit elements completes the transpose. */
vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
}

View File

@@ -0,0 +1,486 @@
/*
* jidctred-neon.c - reduced-size IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
#define CONST_BITS 13
#define PASS1_BITS 2
#define F_0_211 1730
#define F_0_509 4176
#define F_0_601 4926
#define F_0_720 5906
#define F_0_765 6270
#define F_0_850 6967
#define F_0_899 7373
#define F_1_061 8697
#define F_1_272 10426
#define F_1_451 11893
#define F_1_847 15137
#define F_2_172 17799
#define F_2_562 20995
#define F_3_624 29692
/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
* 2x2 output from an 8x8 DCT block. It uses the same calculations and
* produces exactly the same output as IJG's original jpeg_idct_2x2() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.720959822 = 5906 * 2^-13
* 0.850430095 = 6967 * 2^-13
* 1.272758580 = 10426 * 2^-13
* 3.624509785 = 29692 * 2^-13
*
* See jidctred.c for further details of the 2x2 IDCT algorithm. Where
* possible, the variable names and comments here in jsimd_idct_2x2_neon()
* match up with those in jpeg_idct_2x2().
*/
ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
-F_0_720, F_0_850, -F_1_272, F_3_624
};
void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Dequantize DCT coefficients. */
row0 = vmulq_s16(row0, quant_row0);
row1 = vmulq_s16(row1, quant_row1);
row3 = vmulq_s16(row3, quant_row3);
row5 = vmulq_s16(row5, quant_row5);
row7 = vmulq_s16(row7, quant_row7);
/* Load IDCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
/* Pass 1: process columns from input, put results in vectors row0 and
* row1.
*/
/* Even part */
int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
/* Odd part */
int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
/* Transpose two rows, ready for second pass. */
int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
int16x8_t cols_0246 = cols_0246_1357.val[0];
int16x8_t cols_1357 = cols_0246_1357.val[1];
/* Duplicate columns such that each is accessible in its own vector. */
int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
vreinterpretq_s32_s16(cols_1357));
int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
/* Pass 2: process two rows, store to output array. */
/* Even part: we're only interested in col0; the top half of tmp10 is "don't
* care."
*/
int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
/* Odd part: we're only interested in the bottom half of tmp0. */
int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
/* Final output stage: descale and clamp to range [0-255]. */
int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
vsubhn_s32(tmp10, tmp0));
output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
CONST_BITS + PASS1_BITS + 3 + 2 - 16);
/* Narrow to 8-bit and convert to unsigned. */
uint8x8_t output_u8 = vqmovun_s16(output_s16);
/* Store 2x2 block to memory. */
vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
}
/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
* 4x4 output from an 8x8 DCT block. It uses the same calculations and
* produces exactly the same output as IJG's original jpeg_idct_4x4() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.211164243 = 1730 * 2^-13
* 0.509795579 = 4176 * 2^-13
* 0.601344887 = 4926 * 2^-13
* 0.765366865 = 6270 * 2^-13
* 0.899976223 = 7373 * 2^-13
* 1.061594337 = 8697 * 2^-13
* 1.451774981 = 11893 * 2^-13
* 1.847759065 = 15137 * 2^-13
* 2.172734803 = 17799 * 2^-13
* 2.562915447 = 20995 * 2^-13
*
* See jidctred.c for further details of the 4x4 IDCT algorithm. Where
* possible, the variable names and comments here in jsimd_idct_4x4_neon()
* match up with those in jpeg_idct_4x4().
*/
ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
F_1_847, -F_0_765, -F_0_211, F_1_451,
-F_2_172, F_1_061, -F_0_509, -F_0_601,
F_0_899, F_2_562, 0, 0
};
void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values for DC coefficients. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
/* Dequantize DC coefficients. */
row0 = vmulq_s16(row0, quant_row0);
/* Construct bitmap to test if all AC coefficients are 0. */
int16x8_t bitmap = vorrq_s16(row1, row2);
bitmap = vorrq_s16(bitmap, row3);
bitmap = vorrq_s16(bitmap, row5);
bitmap = vorrq_s16(bitmap, row6);
bitmap = vorrq_s16(bitmap, row7);
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
/* All AC coefficients are zero.
* Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
*/
int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
row0 = dcval;
row1 = dcval;
row2 = dcval;
row3 = dcval;
} else if (left_ac_bitmap == 0) {
/* AC coefficients are zero for columns 0, 1, 2, and 3.
* Compute DC values for these columns.
*/
int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
/* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
z2 = vmul_s16(vget_high_s16(row5), quant_row5);
z3 = vmul_s16(vget_high_s16(row3), quant_row3);
int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1));
row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1));
row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1));
row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1));
} else if (right_ac_bitmap == 0) {
/* AC coefficients are zero for columns 4, 5, 6, and 7.
* Compute DC values for these columns.
*/
int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
/* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
z2 = vmul_s16(vget_low_s16(row5), quant_row5);
z3 = vmul_s16(vget_low_s16(row3), quant_row3);
int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1), dcval);
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1), dcval);
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1), dcval);
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1), dcval);
} else {
/* All AC coefficients are non-zero; full IDCT calculation required. */
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Even part */
int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
int16x8_t z2 = vmulq_s16(row2, quant_row2);
int16x8_t z3 = vmulq_s16(row6, quant_row6);
int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
/* Odd part */
int16x8_t z1 = vmulq_s16(row7, quant_row7);
z2 = vmulq_s16(row5, quant_row5);
z3 = vmulq_s16(row3, quant_row3);
int16x8_t z4 = vmulq_s16(row1, quant_row1);
tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
CONST_BITS - PASS1_BITS + 1));
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
CONST_BITS - PASS1_BITS + 1));
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
CONST_BITS - PASS1_BITS + 1));
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
CONST_BITS - PASS1_BITS + 1));
}
/* Transpose 8x4 block to perform IDCT on rows in second pass. */
int16x8x2_t row_01 = vtrnq_s16(row0, row1);
int16x8x2_t row_23 = vtrnq_s16(row2, row3);
int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
vreinterpretq_s32_s16(row_23.val[0]));
int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
vreinterpretq_s32_s16(row_23.val[1]));
int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
/* Commence second pass of IDCT. */
/* Even part */
int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
/* Final output stage: descale and clamp to range [0-255]. */
int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
vsubhn_s32(tmp12, tmp0));
int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
vsubhn_s32(tmp10, tmp2));
output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
/* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
* An interleaving store completes the transpose.
*/
uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
vqmovun_s16(output_cols_13));
uint16x4x2_t output_01_23 = { {
vreinterpret_u16_u8(output_0123.val[0]),
vreinterpret_u16_u8(output_0123.val[1])
} };
/* Store 4x4 block to memory. */
JSAMPROW outptr0 = output_buf[0] + output_col;
JSAMPROW outptr1 = output_buf[1] + output_col;
JSAMPROW outptr2 = output_buf[2] + output_col;
JSAMPROW outptr3 = output_buf[3] + output_col;
vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
}

View File

@@ -0,0 +1,193 @@
/*
* jquanti-neon.c - sample data conversion and quantization (Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include <arm_neon.h>
/* After downsampling, the resulting sample values are in the range [0, 255],
* but the Discrete Cosine Transform (DCT) operates on values centered around
* 0.
*
* To prepare sample values for the DCT, load samples into a DCT workspace,
* subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
* are also widened from 8- to 16-bit.
*
* The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
*/
void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM *workspace)
{
uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
int16x8_t row0 =
vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row1 =
vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row2 =
vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row3 =
vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row4 =
vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row5 =
vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row6 =
vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row7 =
vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
vst1q_s16(workspace + 0 * DCTSIZE, row0);
vst1q_s16(workspace + 1 * DCTSIZE, row1);
vst1q_s16(workspace + 2 * DCTSIZE, row2);
vst1q_s16(workspace + 3 * DCTSIZE, row3);
vst1q_s16(workspace + 4 * DCTSIZE, row4);
vst1q_s16(workspace + 5 * DCTSIZE, row5);
vst1q_s16(workspace + 6 * DCTSIZE, row6);
vst1q_s16(workspace + 7 * DCTSIZE, row7);
}
/* After the DCT, the resulting array of coefficient values needs to be divided
* by an array of quantization values.
*
* To avoid a slow division operation, the DCT coefficients are multiplied by
* the (scaled) reciprocals of the quantization values and then right-shifted.
*
* The equivalent scalar C function quantize() can be found in jcdctmgr.c.
*/
void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
DCTELEM *workspace)
{
JCOEFPTR out_ptr = coef_block;
UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
int i;
#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
#pragma unroll
#endif
for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
/* Load reciprocals of quantization values. */
uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
/* Extract sign from coefficients. */
int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
/* Get absolute value of DCT coefficients. */
uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
/* Add correction. */
abs_row0 = vaddq_u16(abs_row0, corr0);
abs_row1 = vaddq_u16(abs_row1, corr1);
abs_row2 = vaddq_u16(abs_row2, corr2);
abs_row3 = vaddq_u16(abs_row3, corr3);
/* Multiply DCT coefficients by quantization reciprocals. */
int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
vget_low_u16(recip0)));
int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
vget_high_u16(recip0)));
int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
vget_low_u16(recip1)));
int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
vget_high_u16(recip1)));
int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
vget_low_u16(recip2)));
int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
vget_high_u16(recip2)));
int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
vget_low_u16(recip3)));
int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
vget_high_u16(recip3)));
/* Narrow back to 16-bit. */
row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
/* Since VSHR only supports an immediate as its second argument, negate the
* shift value and shift left.
*/
row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
vnegq_s16(shift0)));
row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
vnegq_s16(shift1)));
row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
vnegq_s16(shift2)));
row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
vnegq_s16(shift3)));
/* Restore sign to original product. */
row0 = veorq_s16(row0, sign_row0);
row0 = vsubq_s16(row0, sign_row0);
row1 = veorq_s16(row1, sign_row1);
row1 = vsubq_s16(row1, sign_row1);
row2 = veorq_s16(row2, sign_row2);
row2 = vsubq_s16(row2, sign_row2);
row3 = veorq_s16(row3, sign_row3);
row3 = vsubq_s16(row3, sign_row3);
/* Store quantized coefficients to memory. */
vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
}
}

View File

@@ -0,0 +1,37 @@
/*
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#cmakedefine HAVE_VLD1_S16_X3
#cmakedefine HAVE_VLD1_U16_X2
#cmakedefine HAVE_VLD1Q_U8_X4
/* Define compiler-independent count-leading-zeros and byte-swap macros */
#if defined(_MSC_VER) && !defined(__clang__)
#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
#elif defined(__clang__) || defined(__GNUC__)
#define BUILTIN_CLZ(x) __builtin_clz(x)
#define BUILTIN_CLZLL(x) __builtin_clzll(x)
#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
#else
#error "Unknown compiler"
#endif