Update code to v1.0.14 (10)

2026-05-20 16:35:47 +08:00 · 2024-02-29 19:35:00 +08:00
parent c2ee3b694c
commit a956d26f6d
3188 changed files with 2317293 additions and 146 deletions
--- a/android/extern/libjpeg-turbo/simd/arm/aarch32/jccolext-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+  const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+  const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+  const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+  const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 8) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+      uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_low = scaled_128_5;
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+      cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+      uint32x4_t cb_high = scaled_128_5;
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+      cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_low = scaled_128_5;
+      cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+      uint32x4_t cr_high = scaled_128_5;
+      cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+                                      vrshrn_n_u32(y_high, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+                                       vshrn_n_u32(cb_high, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+                                       vshrn_n_u32(cr_high, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+      /* Increment pointers. */
+      inptr += (8 * RGB_PIXELSIZE);
+      outptr0 += 8;
+      outptr1 += 8;
+      outptr2 += 8;
+    }
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint8_t block_nbits[DCTSIZE2];
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load rows of coefficients from DCT block in zig-zag order. */
+
+  /* Compute DC coefficient difference value. (F.1.1.5.1) */
+  int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+  row0 = vld1q_lane_s16(block +  1, row0, 1);
+  row0 = vld1q_lane_s16(block +  8, row0, 2);
+  row0 = vld1q_lane_s16(block + 16, row0, 3);
+  row0 = vld1q_lane_s16(block +  9, row0, 4);
+  row0 = vld1q_lane_s16(block +  2, row0, 5);
+  row0 = vld1q_lane_s16(block +  3, row0, 6);
+  row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+  int16x8_t row1 = vld1q_dup_s16(block + 17);
+  row1 = vld1q_lane_s16(block + 24, row1, 1);
+  row1 = vld1q_lane_s16(block + 32, row1, 2);
+  row1 = vld1q_lane_s16(block + 25, row1, 3);
+  row1 = vld1q_lane_s16(block + 18, row1, 4);
+  row1 = vld1q_lane_s16(block + 11, row1, 5);
+  row1 = vld1q_lane_s16(block +  4, row1, 6);
+  row1 = vld1q_lane_s16(block +  5, row1, 7);
+
+  int16x8_t row2 = vld1q_dup_s16(block + 12);
+  row2 = vld1q_lane_s16(block + 19, row2, 1);
+  row2 = vld1q_lane_s16(block + 26, row2, 2);
+  row2 = vld1q_lane_s16(block + 33, row2, 3);
+  row2 = vld1q_lane_s16(block + 40, row2, 4);
+  row2 = vld1q_lane_s16(block + 48, row2, 5);
+  row2 = vld1q_lane_s16(block + 41, row2, 6);
+  row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+  int16x8_t row3 = vld1q_dup_s16(block + 27);
+  row3 = vld1q_lane_s16(block + 20, row3, 1);
+  row3 = vld1q_lane_s16(block + 13, row3, 2);
+  row3 = vld1q_lane_s16(block +  6, row3, 3);
+  row3 = vld1q_lane_s16(block +  7, row3, 4);
+  row3 = vld1q_lane_s16(block + 14, row3, 5);
+  row3 = vld1q_lane_s16(block + 21, row3, 6);
+  row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  int16x8_t row1_lz = vclzq_s16(abs_row1);
+  int16x8_t row2_lz = vclzq_s16(abs_row2);
+  int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+  uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+  uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+  uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+  vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+  vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+  vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+  vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+  uint16x8_t row0_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+              vnegq_s16(row0_lz));
+  uint16x8_t row1_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+              vnegq_s16(row1_lz));
+  uint16x8_t row2_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+              vnegq_s16(row2_lz));
+  uint16x8_t row3_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+              vnegq_s16(row3_lz));
+
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+  uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+  uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+  /* Store diff values for rows 0, 1, 2, and 3. */
+  vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+  vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+  vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+  vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+  /* Load last four rows of coefficients from DCT block in zig-zag order. */
+  int16x8_t row4 = vld1q_dup_s16(block + 35);
+  row4 = vld1q_lane_s16(block + 42, row4, 1);
+  row4 = vld1q_lane_s16(block + 49, row4, 2);
+  row4 = vld1q_lane_s16(block + 56, row4, 3);
+  row4 = vld1q_lane_s16(block + 57, row4, 4);
+  row4 = vld1q_lane_s16(block + 50, row4, 5);
+  row4 = vld1q_lane_s16(block + 43, row4, 6);
+  row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+  int16x8_t row5 = vld1q_dup_s16(block + 29);
+  row5 = vld1q_lane_s16(block + 22, row5, 1);
+  row5 = vld1q_lane_s16(block + 15, row5, 2);
+  row5 = vld1q_lane_s16(block + 23, row5, 3);
+  row5 = vld1q_lane_s16(block + 30, row5, 4);
+  row5 = vld1q_lane_s16(block + 37, row5, 5);
+  row5 = vld1q_lane_s16(block + 44, row5, 6);
+  row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+  int16x8_t row6 = vld1q_dup_s16(block + 58);
+  row6 = vld1q_lane_s16(block + 59, row6, 1);
+  row6 = vld1q_lane_s16(block + 52, row6, 2);
+  row6 = vld1q_lane_s16(block + 45, row6, 3);
+  row6 = vld1q_lane_s16(block + 38, row6, 4);
+  row6 = vld1q_lane_s16(block + 31, row6, 5);
+  row6 = vld1q_lane_s16(block + 39, row6, 6);
+  row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+  int16x8_t row7 = vld1q_dup_s16(block + 53);
+  row7 = vld1q_lane_s16(block + 60, row7, 1);
+  row7 = vld1q_lane_s16(block + 61, row7, 2);
+  row7 = vld1q_lane_s16(block + 54, row7, 3);
+  row7 = vld1q_lane_s16(block + 47, row7, 4);
+  row7 = vld1q_lane_s16(block + 55, row7, 5);
+  row7 = vld1q_lane_s16(block + 62, row7, 6);
+  row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  int16x8_t row4_lz = vclzq_s16(abs_row4);
+  int16x8_t row5_lz = vclzq_s16(abs_row5);
+  int16x8_t row6_lz = vclzq_s16(abs_row6);
+  int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+  uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+  uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+  uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+  vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+  vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+  vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+  vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+  uint16x8_t row4_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+              vnegq_s16(row4_lz));
+  uint16x8_t row5_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+              vnegq_s16(row5_lz));
+  uint16x8_t row6_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+              vnegq_s16(row6_lz));
+  uint16x8_t row7_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+              vnegq_s16(row7_lz));
+
+  uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+  uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+  uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+  uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+  /* Store diff values for rows 4, 5, 6, and 7. */
+  vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+  vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+  vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+  vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+  uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+  uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+  uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+  uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+  uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+  uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+  uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+  row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+  row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+  row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+  row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+  row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+  row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+  row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+  row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+  /* Move bitmap to 32-bit scalar registers. */
+  uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+  uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  unsigned int nbits = block_nbits[0];
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = block_diff[0];
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  while (bitmap_1_32 != 0) {
+    r = BUILTIN_CLZ(bitmap_1_32);
+    i += r;
+    bitmap_1_32 <<= r;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    i++;
+    bitmap_1_32 <<= 1;
+  }
+
+  r = 33 - i;
+  i = 33;
+
+  while (bitmap_33_63 != 0) {
+    unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+    r += leading_zeros;
+    i += leading_zeros;
+    bitmap_33_63 <<= leading_zeros;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    r = 0;
+    i++;
+    bitmap_33_63 <<= 1;
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
--- a/android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd.c
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,976 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <ctype.h>
+
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "neon"))
+        simd_support |= JSIMD_NEON;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* We still have a chance to use Neon regardless of globally used
+   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux/android */
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_ycc_extbgr_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
+}
--- a/android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd_neon.S
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch32/jsimd_neon.S
--- a/android/extern/libjpeg-turbo/simd/arm/aarch64/jccolext-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr0 += 16;
+      outptr1 += 16;
+      outptr2 += 16;
+    }
+
+    if (cols_remaining > 8) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+    } else if (cols_remaining > 0) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_l = scaled_128_5;
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+      uint32x4_t cb_h = scaled_128_5;
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_l = scaled_128_5;
+      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+      uint32x4_t cr_h = scaled_128_5;
+      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+                                      vrshrn_n_u32(y_h, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+                                       vshrn_n_u32(cb_h, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+                                       vshrn_n_u32(cr_h, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+    }
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+    0,   1,   2,   3,  16,  17,  32,  33,
+   18,  19,   4,   5,   6,   7,  20,  21,
+   34,  35,  48,  49, 255, 255,  50,  51,
+   36,  37,  22,  23,   8,   9,  10,  11,
+  255, 255,   6,   7,  20,  21,  34,  35,
+   48,  49, 255, 255,  50,  51,  36,  37,
+   54,  55,  40,  41,  26,  27,  12,  13,
+   14,  15,  28,  29,  42,  43,  56,  57,
+    6,   7,  20,  21,  34,  35,  48,  49,
+   50,  51,  36,  37,  22,  23,   8,   9,
+   26,  27,  12,  13, 255, 255,  14,  15,
+   28,  29,  42,  43,  56,  57, 255, 255,
+   52,  53,  54,  55,  40,  41,  26,  27,
+   12,  13, 255, 255,  14,  15,  28,  29,
+   26,  27,  40,  41,  42,  43,  28,  29,
+   14,  15,  30,  31,  44,  45,  46,  47
+};
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address.  That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const uint8x16x4_t idx_rows_0123 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+  const uint8x16x4_t idx_rows_4567 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+  /* GCC does not currently support intrinsics vl1dq_<type>_x4(). */
+  const uint8x16x4_t idx_rows_0123 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+  } };
+  const uint8x16x4_t idx_rows_4567 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+  } };
+#endif
+
+  /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const int8x16x4_t tbl_rows_0123 =
+    vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+  const int8x16x4_t tbl_rows_4567 =
+    vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+  const int8x16x4_t tbl_rows_0123 = { {
+    vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+  } };
+  const int8x16x4_t tbl_rows_4567 = { {
+    vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+  } };
+#endif
+
+  /* Initialise extra lookup tables. */
+  const int8x16x4_t tbl_rows_2345 = { {
+    tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+    tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+  } };
+  const int8x16x3_t tbl_rows_567 =
+    { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+  /* Shuffle coefficients into zig-zag order. */
+  int16x8_t row0 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+  int16x8_t row1 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+  int16x8_t row2 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+  int16x8_t row3 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+  int16x8_t row4 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+  int16x8_t row5 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+  int16x8_t row6 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+  int16x8_t row7 =
+    vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+  /* Compute DC coefficient difference value (F.1.1.5.1). */
+  row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+  /* Initialize AC coefficient lanes not reachable by lookup tables. */
+  row1 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+                                  0), row1, 2);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  4), row2, 0);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  0), row2, 5);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  7), row5, 2);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  3), row5, 7);
+  row6 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+                                  7), row6, 5);
+
+  /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+  uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+  uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+  uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+  uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+  uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+  uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+  uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+  uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+                                    vreinterpretq_u8_u16(row0_ne_0));
+  uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+                                    vreinterpretq_u8_u16(row2_ne_0));
+  uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+                                    vreinterpretq_u8_u16(row4_ne_0));
+  uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+                                    vreinterpretq_u8_u16(row6_ne_0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x16_t bitmap_mask =
+    vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+  uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+  uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+                                              bitmap_rows_3210);
+  uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+                                  vget_high_u8(bitmap_rows_76543210));
+
+  /* Shift left to remove DC bit. */
+  bitmap_all =
+    vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+  /* Count bits set (number of non-zero coefficients) in bitmap. */
+  unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+  /* Move bitmap to 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  /* Find nbits required to specify sign and amplitude of coefficient. */
+  unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+  unsigned int nbits = 16 - lz;
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  /* The most efficient method of computing nbits and diff depends on the
+   * number of non-zero coefficients.  If the bitmap is not too sparse (> 8
+   * non-zero AC coefficients), it is beneficial to do all of the work using
+   * Neon; else we do some of the work using Neon and the rest on demand using
+   * scalar code.
+   */
+  if (non_zero_coefficients > 8) {
+    uint8_t block_nbits[DCTSIZE2];
+
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    int16x8_t row1_lz = vclzq_s16(abs_row1);
+    int16x8_t row2_lz = vclzq_s16(abs_row2);
+    int16x8_t row3_lz = vclzq_s16(abs_row3);
+    int16x8_t row4_lz = vclzq_s16(abs_row4);
+    int16x8_t row5_lz = vclzq_s16(abs_row5);
+    int16x8_t row6_lz = vclzq_s16(abs_row6);
+    int16x8_t row7_lz = vclzq_s16(abs_row7);
+    /* Narrow leading zero count to 8 bits. */
+    uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+                                    vreinterpretq_u8_s16(row1_lz));
+    uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+                                    vreinterpretq_u8_s16(row3_lz));
+    uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+                                    vreinterpretq_u8_s16(row5_lz));
+    uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+                                    vreinterpretq_u8_s16(row7_lz));
+    /* Compute nbits needed to specify magnitude of each coefficient. */
+    uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+    uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+    uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+    uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+    /* Store nbits. */
+    vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+    vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+    vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+    vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+    /* Mask bits not required to specify sign and amplitude of diff. */
+    uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+    uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+    uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+    uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+    uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+    uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+    uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+    /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     row1_mask);
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     row2_mask);
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     row3_mask);
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     row4_mask);
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     row5_mask);
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     row6_mask);
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     row7_mask);
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      nbits = block_nbits[i];
+      diff = block_diff[i];
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  } else if (bitmap != 0) {
+    uint16_t block_abs[DCTSIZE2];
+    /* Compute and store absolute value of coefficients. */
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+    vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+    vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+    vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+    vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+    vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+    vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+    vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+    /* Compute diff bits (without nbits mask) and store. */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     vcltzq_s16(row1));
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     vcltzq_s16(row2));
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     vcltzq_s16(row3));
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     vcltzq_s16(row4));
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     vcltzq_s16(row5));
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     vcltzq_s16(row6));
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     vcltzq_s16(row7));
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    /* Same as above but must mask diff bits and compute nbits on demand. */
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      lz = BUILTIN_CLZ(block_abs[i]);
+      nbits = 32 - lz;
+      diff = ((unsigned int)block_diff[i] << lz) >> lz;
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
--- a/android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd.c
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd.c
--- a/android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd_neon.S
+++ b/android/extern/libjpeg-turbo/simd/arm/aarch64/jsimd_neon.S
--- a/android/extern/libjpeg-turbo/simd/arm/align.h
+++ b/android/extern/libjpeg-turbo/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment)  __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment)  __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
--- a/android/extern/libjpeg-turbo/simd/arm/jccolor-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+#define F_0_168  11059
+#define F_0_331  21709
+#define F_0_500  32768
+#define F_0_418  27439
+#define F_0_081  5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+  F_0_298, F_0_587, F_0_113, F_0_168,
+  F_0_331, F_0_500, F_0_418, F_0_081
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
--- a/android/extern/libjpeg-turbo/simd/arm/jcgray-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
--- a/android/extern/libjpeg-turbo/simd/arm/jcgryext-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                 JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                 int num_rows)
+{
+  JSAMPROW inptr;
+  JSAMPROW outptr;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 16) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 16) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+      uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+      uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+      uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+
+      /* Narrow Y values to 8-bit and store to memory.  Buffer overwrite is
+       * permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr += 16;
+    }
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jchuff.h
+++ b/android/extern/libjpeg-turbo/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE  64
+#else
+#define BIT_BUF_SIZE  32
+#endif
+
+typedef struct {
+  size_t put_buffer;                    /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
+
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+    buffer += 8; \
+  } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+  buffer += 4; \
+}
+#else
+#define SPLAT() { \
+  put_buffer = __builtin_bswap32(put_buffer); \
+  __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    SPLAT(); \
+  } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+  diff |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(diff, nbits) \
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jcphuff-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jcphuff-neon.c
@@ -0,0 +1,623 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *values, size_t *zerobits)
+{
+  UJCOEF *values_ptr = values;
+  UJCOEF *diff_values_ptr = values + DCTSIZE2;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Isolate sign of coefficients. */
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_u16(values_ptr, abs_coefs);
+    vst1q_u16(diff_values_ptr, diff);
+    values_ptr += 8;
+    diff_values_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in the values and diff_values blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_u16(values_ptr, vdupq_n_u16(0));
+    vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
+    values_ptr += 8;
+    diff_values_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap.  A set bit means that the corresponding
+   * coefficient != 0.
+   */
+  uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
+  uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
+  uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
+  uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
+  uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
+  uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
+  uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
+  uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  *zerobits = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  zerobits[0] = ~bitmap0;
+  zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   UJCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  UJCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap. */
+  uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
+  uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
+  uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
+  uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
+  uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
+  uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
+  uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
+  uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    return 63 - BUILTIN_CLZLL(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - BUILTIN_CLZ(bitmap1);
+  } else {
+    return 31 - BUILTIN_CLZ(bitmap0);
+  }
+#endif
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jcsample-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
+  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
+  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
+  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
+  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
+  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
+  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
+  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
+  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+      /* Divide total by 2 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+                         vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+    /* Divide total by 2, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr0, inptr1, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr0 = input_data[outrow];
+    inptr1 = input_data[outrow + 1];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+       */
+      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+      /* Divide total by 4 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory and increment pointers. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels_r0 =
+      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+    uint8x16_t pixels_r1 =
+      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+    uint8x8x2_t table_r0 =
+      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+    uint8x8x2_t table_r1 =
+      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+    /* Divide total by 4, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jdcolext-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+                                JDIMENSION input_row, JSAMPARRAY output_buf,
+                                int num_rows)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    int cols_remaining = output_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+      uint8x16_t y  = vld1q_u8(inptr0);
+      uint8x16_t cb = vld1q_u8(inptr1);
+      uint8x16_t cr = vld1q_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cr)));
+      int16x8_t cr_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cr)));
+      int16x8_t cb_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cb)));
+      int16x8_t cb_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cb)));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+                                            consts, 0);
+      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+                                            consts, 0);
+      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+                                  consts, 1);
+      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+                                  consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+                                         vrshrn_n_s32(g_sub_y_lh, 15));
+      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+                                         vrshrn_n_s32(g_sub_y_hh, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+                                               consts, 2);
+      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+                                               consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+                                               consts, 3);
+      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+                                               consts, 3);
+      /* Add Y. */
+      int16x8_t r_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t r_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t b_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t b_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t g_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t g_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+                                       vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x16x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Store RGB pixel data to memory. */
+      vst3q_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 16;
+      inptr1 += 16;
+      inptr2 += 16;
+      outptr += (RGB_PIXELSIZE * 16);
+    }
+
+    if (cols_remaining >= 8) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      vst3_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 8;
+      inptr1 += 8;
+      inptr2 += 8;
+      outptr += (RGB_PIXELSIZE * 8);
+      cols_remaining -= 8;
+    }
+
+    /* Handle the tail elements. */
+    if (cols_remaining > 0) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst4_lane_u8(outptr, rgba, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst3_lane_u8(outptr, rgb, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB565 pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#endif
+    }
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jdcolor-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jdcolor-neon.c
@@ -0,0 +1,141 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
--- a/android/extern/libjpeg-turbo/simd/arm/jdmerge-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jdmerge-neon.c
@@ -0,0 +1,144 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
--- a/android/extern/libjpeg-turbo/simd/arm/jdmrgext-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba;
+    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr, rgba);
+#else
+    uint8x16x3_t rgb;
+    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr, rgb);
+#endif
+
+    /* Increment pointers. */
+    inptr0 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba_h;
+    rgba_h.val[RGB_RED] = r.val[1];
+    rgba_h.val[RGB_GREEN] = g.val[1];
+    rgba_h.val[RGB_BLUE] = b.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    uint8x8x4_t rgba_l;
+    rgba_l.val[RGB_RED] = r.val[0];
+    rgba_l.val[RGB_GREEN] = g.val[0];
+    rgba_l.val[RGB_BLUE] = b.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr, rgba_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr, rgba_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb_h;
+    rgb_h.val[RGB_RED] = r.val[1];
+    rgb_h.val[RGB_GREEN] = g.val[1];
+    rgb_h.val[RGB_BLUE] = b.val[1];
+    uint8x8x3_t rgb_l;
+    rgb_l.val[RGB_RED] = r.val[0];
+    rgb_l.val[RGB_GREEN] = g.val[0];
+    rgb_l.val[RGB_BLUE] = b.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr, rgb_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr, rgb_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr0, outptr1;
+  /* Pointers to Y (both rows), Cb, and Cr data */
+  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba0, rgba1;
+    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr0, rgba0);
+    vst4q_u8(outptr1, rgba1);
+#else
+    uint8x16x3_t rgb0, rgb1;
+    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr0, rgb0);
+    vst3q_u8(outptr1, rgb1);
+#endif
+
+    /* Increment pointers. */
+    inptr0_0 += 16;
+    inptr0_1 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr0 += (RGB_PIXELSIZE * 16);
+    outptr1 += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba0_h, rgba1_h;
+    rgba0_h.val[RGB_RED] = r0.val[1];
+    rgba1_h.val[RGB_RED] = r1.val[1];
+    rgba0_h.val[RGB_GREEN] = g0.val[1];
+    rgba1_h.val[RGB_GREEN] = g1.val[1];
+    rgba0_h.val[RGB_BLUE] = b0.val[1];
+    rgba1_h.val[RGB_BLUE] = b1.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+    uint8x8x4_t rgba0_l, rgba1_l;
+    rgba0_l.val[RGB_RED] = r0.val[0];
+    rgba1_l.val[RGB_RED] = r1.val[0];
+    rgba0_l.val[RGB_GREEN] = g0.val[0];
+    rgba1_l.val[RGB_GREEN] = g1.val[0];
+    rgba0_l.val[RGB_BLUE] = b0.val[0];
+    rgba1_l.val[RGB_BLUE] = b1.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr0, rgba0_l);
+      vst4_u8(outptr1, rgba1_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr0, rgba0_l, 0);
+      vst4_lane_u8(outptr1, rgba1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb0_h, rgb1_h;
+    rgb0_h.val[RGB_RED] = r0.val[1];
+    rgb1_h.val[RGB_RED] = r1.val[1];
+    rgb0_h.val[RGB_GREEN] = g0.val[1];
+    rgb1_h.val[RGB_GREEN] = g1.val[1];
+    rgb0_h.val[RGB_BLUE] = b0.val[1];
+    rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+    uint8x8x3_t rgb0_l, rgb1_l;
+    rgb0_l.val[RGB_RED] = r0.val[0];
+    rgb1_l.val[RGB_RED] = r1.val[0];
+    rgb0_l.val[RGB_GREEN] = g0.val[0];
+    rgb1_l.val[RGB_GREEN] = g1.val[0];
+    rgb0_l.val[RGB_BLUE] = b0.val[0];
+    rgb1_l.val[RGB_BLUE] = b1.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr0, rgb0_l);
+      vst3_u8(outptr1, rgb1_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr0, rgb0_l, 0);
+      vst3_lane_u8(outptr1, rgb1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jdsample-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            |         |         |         |
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *            |         |         |         |
+ *            +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ *     p0(upsampled) = s0
+ *     p5(upsampled) = s2
+ */
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* First pixel component value in this row of the original image */
+    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
+     * For p1: containing sample = s0, nearest neighboring sample = s1
+     * For p2: containing sample = s1, nearest neighboring sample = s0
+     */
+    uint8x16_t s0 = vld1q_u8(inptr);
+    uint8x16_t s1 = vld1q_u8(inptr + 1);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s1_add_3s0_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+    uint16x8_t s1_add_3s0_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+    uint16x8_t s0_add_3s1_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+    uint16x8_t s0_add_3s1_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+    /* Add ordered dithering bias to odd pixel values. */
+    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+    /* The offset is initially 1, because the first pixel component has already
+     * been stored.  However, in subsequent iterations of the SIMD loop, this
+     * offset is (2 * colctr - 1) to stay within the bounds of the sample
+     * buffers without having to resort to a slow scalar tail case for the last
+     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
+     * in jmemmgr.c for more details.
+     */
+    unsigned outptr_offset = 1;
+    uint8x16x2_t output_pixels;
+
+    /* We use software pipelining to maximise performance.  The code indented
+     * an extra two spaces begins the next iteration of the loop.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+        s0 = vld1q_u8(inptr + colctr - 1);
+        s1 = vld1q_u8(inptr + colctr);
+
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                         vrshrn_n_u16(s1_add_3s0_h, 2));
+      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                         vshrn_n_u16(s0_add_3s1_h, 2));
+
+        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+         * denote low half and high half respectively.
+         */
+        s1_add_3s0_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+        s1_add_3s0_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+        s0_add_3s1_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+        s0_add_3s1_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+        /* Add ordered dithering bias to odd pixel values. */
+        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr + outptr_offset, output_pixels);
+      outptr_offset = 2 * colctr - 1;
+    }
+
+    /* Complete the last iteration of the loop. */
+
+    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                       vrshrn_n_u16(s1_add_3s0_h, 2));
+    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                       vshrn_n_u16(s0_add_3s1_h, 2));
+    /* Store pixel component values to memory. */
+    vst2q_u8(outptr + outptr_offset, output_pixels);
+
+    /* Last pixel component value in this row of the original image */
+    outptr[2 * downsampled_width - 1] =
+      GETJSAMPLE(inptr[downsampled_width - 1]);
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *       sA   |         |         |         |
+ *            | p6   p7 | p8   p9 | p10  p11|
+ *            +---------+---------+---------+
+ *            | p12  p13| p14  p15| p16  p17|
+ *       sB   |         |         |         |
+ *            | p18  p19| p20  p21| p22  p23|
+ *            +---------+---------+---------+
+ *            | p24  p25| p26  p27| p28  p29|
+ *       sC   |         |         |         |
+ *            | p30  p31| p32  p33| p34  p35|
+ *            +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above.  To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1.  For example:
+ *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ *                      1/4 * (3/4 * s0B + 1/4 * s0A)
+ *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ *     p0(upsampled) = s0A
+ *     p35(upsampled) = s2C
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* First pixel component value in this row of the original image */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* Step 1: Blend samples vertically in columns s0 and s1.
+     * Leave the divide by 4 until the end, when it can be done for both
+     * dimensions at once, right-shifting by 4.
+     */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0A = vld1q_u8(inptr0);
+    uint8x16_t s0B = vld1q_u8(inptr1);
+    uint8x16_t s0C = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+                                      vget_high_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+                                      vget_high_u8(s0B), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+                                      vget_high_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+                                      vget_high_u8(s1B), three_u8);
+
+    /* Step 2: Blend the already-blended columns. */
+
+    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+    /* Add ordered dithering bias to odd pixel values. */
+    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+    uint8x16x2_t output_pixels0 = { {
+      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+    } };
+    uint8x16x2_t output_pixels1 = { {
+      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+    } };
+
+    /* Store pixel component values to memory.
+     * The minimum size of the output buffer for each row is 64 bytes => no
+     * need to worry about buffer overflow here.  See "Creation of 2-D sample
+     * arrays" in jmemmgr.c for more details.
+     */
+    vst2q_u8(outptr0 + 1, output_pixels0);
+    vst2q_u8(outptr1 + 1, output_pixels1);
+
+    /* The first pixel of the image shifted our loads and stores by one byte.
+     * We have to re-align on a 32-byte boundary at some point before the end
+     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+     * bounds of the sample buffers without having to resort to a slow scalar
+     * tail case for the last (downsampled_width % 16) samples.  See "Creation
+     * of 2-D sample arrays" in jmemmgr.c for more details.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+      /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+      /* Load and compute s0colsum0 and s0colsum1. */
+      s0A = vld1q_u8(inptr0 + colctr - 1);
+      s0B = vld1q_u8(inptr1 + colctr - 1);
+      s0C = vld1q_u8(inptr2 + colctr - 1);
+      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+                             three_u8);
+      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+                             three_u8);
+      /* Load and compute s1colsum0 and s1colsum1. */
+      s1A = vld1q_u8(inptr0 + colctr);
+      s1B = vld1q_u8(inptr1 + colctr);
+      s1C = vld1q_u8(inptr2 + colctr);
+      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+                             three_u8);
+      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+                             three_u8);
+
+      /* Step 2: Blend the already-blended columns. */
+
+      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+      /* Add ordered dithering bias to odd pixel values. */
+      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                          vshrn_n_u16(output0_p1_h, 4));
+      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                          vrshrn_n_u16(output0_p2_h, 4));
+      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                          vshrn_n_u16(output1_p1_h, 4));
+      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                          vrshrn_n_u16(output1_p2_h, 4));
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+    }
+
+    /* Last pixel component value in this row of the original image */
+    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr0[downsampled_width - 1]);
+    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr2[downsampled_width - 1]);
+    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+    inrow++;
+  }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ *            +---------+
+ *            |   p0    |
+ *     sA     |         |
+ *            |   p1    |
+ *            +---------+
+ *            |   p2    |
+ *     sB     |         |
+ *            |   p3    |
+ *            +---------+
+ *            |   p4    |
+ *     sC     |         |
+ *            |   p5    |
+ *            +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * sA + 1/4 * sB
+ *     p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ *     p0(upsampled) = sA
+ *     p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;
+
+    /* The size of the input and output buffers is always a multiple of 32
+     * bytes => no need to worry about buffer overflow when reading/writing
+     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
+     * details.
+     */
+    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+                                      vget_high_u8(sB), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+                                      vget_high_u8(sB), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store pixel component values to memory. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            |         |         |
+ *            | p0   p1 | p2   p3 |
+ *            |         |         |
+ *            +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above.  To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ *     p0(upsampled) = p1(upsampled) = s0
+ *     p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr + 2 * colctr, output_pixels);
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            | p0   p1 | p2   p3 |
+ *       sA   |         |         |
+ *            | p4   p5 | p6   p7 |
+ *            +---------+---------+
+ *            | p8   p9 | p10  p11|
+ *       sB   |         |         |
+ *            | p12  p13| p14  p15|
+ *            +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above.  To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values for both output rows to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+    }
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jfdctfst-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17795 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306  9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  /* Load DCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jfdctint-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+  F_0_298, -F_0_390,  F_0_541,  F_0_765,
+ -F_0_899,  F_1_175,  F_1_501, -F_1_847,
+ -F_1_961,  F_2_053, -F_2_562,  F_3_072
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+  /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t s_rows_0123 = vld4q_s16(data);
+  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  int32x4_t z1_l =
+    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  int32x4_t z1_h =
+    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t col2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t col2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+  int32x4_t col6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t col6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+  /* Odd part */
+  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t row2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t row2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+  int32x4_t row6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t row6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+  /* Odd part */
+  z1 = vaddq_s16(tmp4, tmp7);
+  z2 = vaddq_s16(tmp5, tmp6);
+  z3 = vaddq_s16(tmp4, tmp6);
+  z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jidctfst-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2
+
+#define F_0_082  2688
+#define F_0_414  13568
+#define F_0_847  27776
+#define F_0_613  20096
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row4);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into vectors.
+     */
+    int16x8_t dcval = row0;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+    row4 = dcval;
+    row5 = dcval;
+    row6 = dcval;
+    row7 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_low_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_high_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_high_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_low_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+  } else {
+    /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+    /* Load quantization table. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2 */
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(col0, col4);
+  int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+  int16x8_t tmp13 = vaddq_s16(col2, col6);
+  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+  tmp12 = vsubq_s16(tmp12, tmp13);
+
+  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+  /* Odd part */
+  int16x8_t z13 = vaddq_s16(col5, col3);
+  int16x8_t neg_z10 = vsubq_s16(col3, col5);
+  int16x8_t z11 = vaddq_s16(col1, col7);
+  int16x8_t z12 = vsubq_s16(col1, col7);
+
+  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
+  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+  tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+  z5 = vaddq_s16(z5, z10_add_z12);
+  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+  tmp10 = vaddq_s16(tmp10, z12);
+  tmp10 = vsubq_s16(tmp10, z5);
+  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+  tmp12 = vaddq_s16(tmp12, z5);
+
+  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
+  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+  col0 = vaddq_s16(tmp0, tmp7);
+  col7 = vsubq_s16(tmp0, tmp7);
+  col1 = vaddq_s16(tmp1, tmp6);
+  col6 = vsubq_s16(tmp1, tmp6);
+  col2 = vaddq_s16(tmp2, tmp5);
+  col5 = vsubq_s16(tmp2, tmp5);
+  col4 = vaddq_s16(tmp3, tmp4);
+  col3 = vsubq_s16(tmp3, tmp4);
+
+  /* Scale down by a factor of 8, narrowing to 8-bit. */
+  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
+  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
+  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
+  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
+  /* Clamp to range [0-255]. */
+  uint8x16_t cols_01 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_45 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_23 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_67 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+  /* Transpose block to prepare for store. */
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store DCT block to memory. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jidctint-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jidctint-neon.c
@@ -0,0 +1,801 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time.  Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  F_0_899,             F_0_541,
+  F_2_562,             F_0_298_MINUS_0_899,
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+  F_0_541_PLUS_0_765,  F_1_175,
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+  0, 0, 0, 0
+};
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients.  For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block.  This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients.  Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.)  Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ *      +-------+-------+                   +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   0   |   1   |                   |   0   |   2   |
+ *      |       |       |    transpose      |       |       |
+ *      +-------+-------+     ------>       +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   2   |   3   |                   |   1   |   3   |
+ *      |       |       |                   |       |       |
+ *      +-------+-------+                   +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right.  If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified.  On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
+ *    "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
+ *    result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
+ *    result is all zero.  For the left 4x8 half, this is handled identically
+ *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
+ *    the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block.  (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  int16_t workspace_l[8 * DCTSIZE / 2];
+  int16_t workspace_r[8 * DCTSIZE / 2];
+
+  /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+  /* Load DCT coefficients in left 4x8 block. */
+  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table for left 4x8 block. */
+  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+  int16x4_t bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  if (bitmap_rows_4567 == 0) {
+    bitmap = vorr_s16(bitmap, row3);
+    bitmap = vorr_s16(bitmap, row2);
+    bitmap = vorr_s16(bitmap, row1);
+    int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (left_ac_bitmap == 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l, quadrant);
+      vst4_s16(workspace_r, quadrant);
+    } else {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l, workspace_r);
+    }
+  } else {
+    jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                   row6, row7, quant_row0, quant_row1,
+                                   quant_row2, quant_row3, quant_row4,
+                                   quant_row5, quant_row6, quant_row7,
+                                   workspace_l, workspace_r);
+  }
+
+  /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+  /* Load DCT coefficients in right 4x8 block. */
+  row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+  row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+  row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+  row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+  row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+  row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+  row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+  row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+  /* Load quantization table for right 4x8 block. */
+  quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+  quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+  quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+  quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+  quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+  quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+  quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+  quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+  /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+  bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+  bitmap = vorr_s16(bitmap, row3);
+  bitmap = vorr_s16(bitmap, row2);
+  bitmap = vorr_s16(bitmap, row1);
+  int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  /* If this remains non-zero, a "regular" second pass will be performed. */
+  int64_t right_ac_dc_bitmap = 1;
+
+  if (right_ac_bitmap == 0) {
+    bitmap = vorr_s16(bitmap, row0);
+    right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (right_ac_dc_bitmap != 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+    }
+  } else {
+    if (bitmap_rows_4567 == 0) {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l + 4 * DCTSIZE / 2,
+                                    workspace_r + 4 * DCTSIZE / 2);
+    } else {
+      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                     row6, row7, quant_row0, quant_row1,
+                                     quant_row2, quant_row3, quant_row4,
+                                     quant_row5, quant_row6, quant_row7,
+                                     workspace_l + 4 * DCTSIZE / 2,
+                                     workspace_r + 4 * DCTSIZE / 2);
+    }
+  }
+
+  /* Second pass: compute IDCT on rows in workspace. */
+
+  /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+  if (right_ac_dc_bitmap == 0) {
+    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+  } else {
+    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+  }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
+ * function-- or some other optimized variant-- needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
+ * found in jidctint.c.  Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  z3_s16 = vmul_s16(row4, quant_row4);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+  int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0.  This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients.  (To process the full 8x8 DCT block, this function-- or some
+ * other optimized variant-- needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values are all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
+ * can be found in jidctint.c.  Algorithmic changes made here are documented
+ * inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+  int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Performs the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0.  This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jidctred-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define F_0_211  1730
+#define F_0_509  4176
+#define F_0_601  4926
+#define F_0_720  5906
+#define F_0_765  6270
+#define F_0_850  6967
+#define F_0_899  7373
+#define F_1_061  8697
+#define F_1_272  10426
+#define F_1_451  11893
+#define F_1_847  15137
+#define F_2_172  17799
+#define F_2_562  20995
+#define F_3_624  29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+  -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+  /* Dequantize DCT coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+  row1 = vmulq_s16(row1, quant_row1);
+  row3 = vmulq_s16(row3, quant_row3);
+  row5 = vmulq_s16(row5, quant_row5);
+  row7 = vmulq_s16(row7, quant_row7);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+  /* Pass 1: process columns from input, put results in vectors row0 and
+   * row1.
+   */
+
+  /* Even part */
+  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+  /* Odd part */
+  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+  /* Transpose two rows, ready for second pass. */
+  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+  int16x8_t cols_0246 = cols_0246_1357.val[0];
+  int16x8_t cols_1357 = cols_0246_1357.val[1];
+  /* Duplicate columns such that each is accessible in its own vector. */
+  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+                                         vreinterpretq_s32_s16(cols_1357));
+  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+  /* Pass 2: process two rows, store to output array. */
+
+  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+   * care."
+   */
+  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+  /* Odd part: we're only interested in the bottom half of tmp0. */
+  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+                                      vsubhn_s32(tmp10, tmp0));
+  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+  /* Narrow to 8-bit and convert to unsigned. */
+  uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+  /* Store 2x2 block to memory. */
+  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.211164243 =  1730 * 2^-13
+ *    0.509795579 =  4176 * 2^-13
+ *    0.601344887 =  4926 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.061594337 =  8697 * 2^-13
+ *    1.451774981 = 11893 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    2.172734803 = 17799 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+  F_1_847, -F_0_765, -F_0_211,  F_1_451,
+ -F_2_172,  F_1_061, -F_0_509, -F_0_601,
+  F_0_899,  F_2_562,        0,        0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0  = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1  = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2  = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3  = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5  = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6  = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7  = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+     */
+    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+    row0 = dcval;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+  } else {
+    /* All AC coefficients are non-zero; full IDCT calculation required. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x8_t z2 = vmulq_s16(row2, quant_row2);
+    int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+    /* Odd part */
+    int16x8_t z1 = vmulq_s16(row7, quant_row7);
+    z2 = vmulq_s16(row5, quant_row5);
+    z3 = vmulq_s16(row3, quant_row3);
+    int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+  }
+
+  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+                                    vreinterpretq_s32_s16(row_23.val[0]));
+  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+                                    vreinterpretq_s32_s16(row_23.val[1]));
+
+  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+  /* Commence second pass of IDCT. */
+
+  /* Even part */
+  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+  /* Odd part */
+  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+                                          vsubhn_s32(tmp12, tmp0));
+  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+                                          vsubhn_s32(tmp10, tmp2));
+  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+   * An interleaving store completes the transpose.
+   */
+  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+                                    vqmovun_s16(output_cols_13));
+  uint16x4x2_t output_01_23 = { {
+    vreinterpret_u16_u8(output_0123.val[0]),
+    vreinterpret_u16_u8(output_0123.val[1])
+  } };
+
+  /* Store 4x4 block to memory. */
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
--- a/android/extern/libjpeg-turbo/simd/arm/jquanti-neon.c
+++ b/android/extern/libjpeg-turbo/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+                         DCTELEM *workspace)
+{
+  JCOEFPTR out_ptr = coef_block;
+  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+  int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+    /* Load DCT coefficients. */
+    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+    /* Load reciprocals of quantization values. */
+    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+    /* Extract sign from coefficients. */
+    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+    /* Get absolute value of DCT coefficients. */
+    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+    /* Add correction. */
+    abs_row0 = vaddq_u16(abs_row0, corr0);
+    abs_row1 = vaddq_u16(abs_row1, corr1);
+    abs_row2 = vaddq_u16(abs_row2, corr2);
+    abs_row3 = vaddq_u16(abs_row3, corr3);
+
+    /* Multiply DCT coefficients by quantization reciprocals. */
+    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+                                                       vget_low_u16(recip0)));
+    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+                                                       vget_high_u16(recip0)));
+    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+                                                       vget_low_u16(recip1)));
+    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+                                                       vget_high_u16(recip1)));
+    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+                                                       vget_low_u16(recip2)));
+    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+                                                       vget_high_u16(recip2)));
+    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+                                                       vget_low_u16(recip3)));
+    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+                                                       vget_high_u16(recip3)));
+    /* Narrow back to 16-bit. */
+    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+    /* Since VSHR only supports an immediate as its second argument, negate the
+     * shift value and shift left.
+     */
+    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+                                           vnegq_s16(shift0)));
+    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+                                           vnegq_s16(shift1)));
+    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+                                           vnegq_s16(shift2)));
+    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+                                           vnegq_s16(shift3)));
+
+    /* Restore sign to original product. */
+    row0 = veorq_s16(row0, sign_row0);
+    row0 = vsubq_s16(row0, sign_row0);
+    row1 = veorq_s16(row1, sign_row1);
+    row1 = vsubq_s16(row1, sign_row1);
+    row2 = veorq_s16(row2, sign_row2);
+    row2 = vsubq_s16(row2, sign_row2);
+    row3 = veorq_s16(row3, sign_row3);
+    row3 = vsubq_s16(row3, sign_row3);
+
+    /* Store quantized coefficients to memory. */
+    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+  }
+}
--- a/android/extern/libjpeg-turbo/simd/arm/neon-compat.h.in
+++ b/android/extern/libjpeg-turbo/simd/arm/neon-compat.h.in
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x)  __builtin_clz(x)
+#define BUILTIN_CLZLL(x)  __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif