mirror of
https://github.com/Cateners/tiny_computer.git
synced 2026-05-21 08:55:48 +08:00
Update code to v1.0.14 (10)
This commit is contained in:
192
android/extern/libjpeg-turbo/simd/arm/jcsample-neon.c
vendored
Normal file
192
android/extern/libjpeg-turbo/simd/arm/jcsample-neon.c
vendored
Normal file
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
* jcsample-neon.c - downsampling (Arm Neon)
|
||||
*
|
||||
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../../jinclude.h"
|
||||
#include "../../jpeglib.h"
|
||||
#include "../../jsimd.h"
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
||||
ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
|
||||
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
|
||||
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
|
||||
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
|
||||
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
|
||||
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
|
||||
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
|
||||
0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
|
||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
|
||||
0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
|
||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
};
|
||||
|
||||
|
||||
/* Downsample pixel values of a single component.
|
||||
* This version handles the common case of 2:1 horizontal and 1:1 vertical,
|
||||
* without smoothing.
|
||||
*/
|
||||
|
||||
void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
|
||||
JDIMENSION v_samp_factor,
|
||||
JDIMENSION width_in_blocks,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
JSAMPROW inptr, outptr;
|
||||
/* Load expansion mask to pad remaining elements of last DCT block. */
|
||||
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
|
||||
const uint8x16_t expand_mask =
|
||||
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
|
||||
/* Load bias pattern (alternating every pixel.) */
|
||||
/* { 0, 1, 0, 1, 0, 1, 0, 1 } */
|
||||
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
|
||||
unsigned i, outrow;
|
||||
|
||||
for (outrow = 0; outrow < v_samp_factor; outrow++) {
|
||||
outptr = output_data[outrow];
|
||||
inptr = input_data[outrow];
|
||||
|
||||
/* Downsample all but the last DCT block of pixels. */
|
||||
for (i = 0; i < width_in_blocks - 1; i++) {
|
||||
uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
|
||||
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
|
||||
/* Divide total by 2 and narrow to 8-bit. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
|
||||
/* Store samples to memory. */
|
||||
vst1_u8(outptr + i * DCTSIZE, samples_u8);
|
||||
}
|
||||
|
||||
/* Load pixels in last DCT block into a table. */
|
||||
uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Pad the empty elements with the value of the last pixel. */
|
||||
pixels = vqtbl1q_u8(pixels, expand_mask);
|
||||
#else
|
||||
uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
|
||||
pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
|
||||
vtbl2_u8(table, vget_high_u8(expand_mask)));
|
||||
#endif
|
||||
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
|
||||
/* Divide total by 2, narrow to 8-bit, and store. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
|
||||
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Downsample pixel values of a single component.
|
||||
* This version handles the standard case of 2:1 horizontal and 2:1 vertical,
|
||||
* without smoothing.
|
||||
*/
|
||||
|
||||
void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
|
||||
JDIMENSION v_samp_factor,
|
||||
JDIMENSION width_in_blocks,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
JSAMPROW inptr0, inptr1, outptr;
|
||||
/* Load expansion mask to pad remaining elements of last DCT block. */
|
||||
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
|
||||
const uint8x16_t expand_mask =
|
||||
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
|
||||
/* Load bias pattern (alternating every pixel.) */
|
||||
/* { 1, 2, 1, 2, 1, 2, 1, 2 } */
|
||||
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
|
||||
unsigned i, outrow;
|
||||
|
||||
for (outrow = 0; outrow < v_samp_factor; outrow++) {
|
||||
outptr = output_data[outrow];
|
||||
inptr0 = input_data[outrow];
|
||||
inptr1 = input_data[outrow + 1];
|
||||
|
||||
/* Downsample all but the last DCT block of pixels. */
|
||||
for (i = 0; i < width_in_blocks - 1; i++) {
|
||||
uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
|
||||
uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
|
||||
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
|
||||
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
|
||||
*/
|
||||
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
|
||||
/* Divide total by 4 and narrow to 8-bit. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
|
||||
/* Store samples to memory and increment pointers. */
|
||||
vst1_u8(outptr + i * DCTSIZE, samples_u8);
|
||||
}
|
||||
|
||||
/* Load pixels in last DCT block into a table. */
|
||||
uint8x16_t pixels_r0 =
|
||||
vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
|
||||
uint8x16_t pixels_r1 =
|
||||
vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
/* Pad the empty elements with the value of the last pixel. */
|
||||
pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
|
||||
pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
|
||||
#else
|
||||
uint8x8x2_t table_r0 =
|
||||
{ { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
|
||||
uint8x8x2_t table_r1 =
|
||||
{ { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
|
||||
pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
|
||||
vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
|
||||
pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
|
||||
vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
|
||||
#endif
|
||||
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
|
||||
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
|
||||
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
|
||||
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
|
||||
/* Divide total by 4, narrow to 8-bit, and store. */
|
||||
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
|
||||
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user