Skip to content

Latest commit

 

History

History
1671 lines (1579 loc) · 56.5 KB

godbolt_source_vertpass.md

File metadata and controls

1671 lines (1579 loc) · 56.5 KB

Diff template

  • Source 1
#include <immintrin.h>
#include <cassert>
#include <cstring>


#define C10_RESTRICT __restrict
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))

using uint8_t = unsigned char;
using uint32_t = unsigned int;


static __m128i inline mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr) {
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
}


void ImagingResampleVerticalConvolution8u(
    uint8_t* C10_RESTRICT lineOut,
    const uint8_t* C10_RESTRICT lineIn,
    int64_t xsize,
    int64_t ids_min,
    int64_t ids_size,
    const int16_t* k,
    unsigned int coefs_precision,
    int64_t num_channels) {

  // Interpolation vertical pass processing one line.
  // - We process x-axis data with blocks of 8, 2 and 1
  // - We split the size of weight vector for a given output index as a sum: K = n * 2 + m.

  // xsize = output width, also equals to input width
  // ids_size = interpolation size
  // ids_min = input y start index
  const auto stride = num_channels * 1;  // num channels * sizeof(uint8)

//   TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4);
  assert(stride == 3 || stride == 4);

  const int64_t data_size = xsize * stride;
  const int64_t data_stride = stride;
  constexpr auto vec_size = 256 / 8;

  const auto initial = _mm_set1_epi32(1 << (coefs_precision - 1));
  const auto initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));
  const auto zero = _mm_setzero_si128();
  const auto zero_256 = _mm256_setzero_si256();

  int64_t j = 0;
  // block 8
  const auto b8_usable_vec_stride = (vec_size / data_stride) * data_stride;
  for (; j < data_size - vec_size; j += b8_usable_vec_stride) {
    auto sss0 = initial_256;
    auto sss1 = initial_256;
    auto sss2 = initial_256;
    auto sss3 = initial_256;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;

    for (; i < ids_size - 1; i += 2) {
      // Load 2 values from weight vector
      auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]);

      // RGBA: Load 8 pixels per line
      // source1 = [
      //    r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
      //    r4 g4 b4 a4  r5 g5 b5 a5  r6 g6 b6 a6  r7 g7 b7 a7
      // ]
      // RGB: Load 8 pixels per line (however we can process only 8 pixels):
      // source1 = [
      //    r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
      //    r4 g4 b4 r5  g5 b5 r6 g6  b6 r7 g7 b7  r8 g8 b8 r9
      // ]
      auto source1 =
          _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * i));
      auto source2 =
          _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * (i + 1)));

      // Interleave source1 and source2 from the low half of each 128-bit lane
      // and cast the result to epi16
      // RGBA: pix1 = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  a1 0 A1 0
      // ]
      // RGB: pix1 = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  0 0 0 0
      // ]
      auto source_lo = _mm256_unpacklo_epi8(source1, source2);
      auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256);
      // Compute output value as
      //   C += w0 * c0 + w1 * C0
      //   C += w0 * c1 + w1 * C1 for each channel in 32-bit precision
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));

      // RGBA: pix2 = [
      //    r2 0 R2 0  g2 0 G2 0  b2 0 B2 0  a2 0 A2 0
      //    r3 0 R3 0  g3 0 G3 0  b3 0 B3 0  a3 0 A3 0
      // ]
      // RGB: pix2 = [
      //    r2 0 R2 0  g2 0 G2 0  b2 0 B2 0  0 0 0 0
      //    r3 0 R3 0  g3 0 G3 0  b3 0 B3 0  0 0 0 0
      // ]
      auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256);
      // Compute output value as
      //   C += w0 * c2 + w1 * C2
      //   C += w0 * c3 + w1 * C3 for each channel in 32-bit precision
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));

      // Same as above for the high half of each 128-bit lane
      auto source_hi = _mm256_unpackhi_epi8(source1, source2);
      auto pix3 = _mm256_unpacklo_epi8(source_hi, zero_256);
      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk));
      auto pix4 = _mm256_unpackhi_epi8(source_hi, zero_256);
      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk));
    }
    // Same processing as above but with a single weight value
    for (; i < ids_size; i += 1) {
      auto mmk = _mm256_set1_epi32(k[i]);

      auto source1 = _mm256_loadu_si256((__m256i*)(lineIn_min + i * data_size));

      auto source_lo = _mm256_unpacklo_epi8(source1, zero_256);
      auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256);
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));
      auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256);
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));

      auto source_hi = _mm256_unpackhi_epi8(source1, zero_256);
      auto pix3 = _mm256_unpacklo_epi8(source_hi, _mm256_setzero_si256());
      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk));
      auto pix4 = _mm256_unpackhi_epi8(source_hi, _mm256_setzero_si256());
      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk));
    }
    // Shift back the output integer values: output = output >> weights_precision
    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
    sss2 = _mm256_srai_epi32(sss2, coefs_precision);
    sss3 = _mm256_srai_epi32(sss3, coefs_precision);
    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
    // (a a a a b b b b | c c c c d d d d) -> (a a b b c c d d)
    sss0 = _mm256_packs_epi32(sss0, sss1);
    sss2 = _mm256_packs_epi32(sss2, sss3);
    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
    // (a a b b c c d d) -> (a b c d)
    sss0 = _mm256_packus_epi16(sss0, sss2);

    // Stores 32 bytes
    _mm256_storeu_si256((__m256i*)(lineOut + j), sss0);
  }

  // TODO: Do we also need block 4 ???
  // block 2
  const auto b2_usable_vec_stride = (8 / data_stride) * data_stride;
  for (; j < data_size - vec_size / 4; j += b2_usable_vec_stride) {
    auto sss0 = initial;
    auto sss1 = initial;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;

    for (; i < ids_size - 1; i += 2) {
      // Load 2 values from weight vector
      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ... ]
      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);

      // Load 2 pixels per line
      // RGBA: source1 = [
      //    r0 g0 b0 a0  r1 g1 b1 a1  0 0 0 0  0 0 0 0
      // ]
      // RGB: source1 = [
      //    r0 g0 b0 r1  g1 b1 r2 g2  0 0 0 0  0 0 0 0
      // ]
      auto source1 = _mm_loadl_epi64((__m128i *) (lineIn_min + i * data_size));
      auto source2 = _mm_loadl_epi64((__m128i *) (lineIn_min + (i + 1) * data_size));
      // Interleave source1 and source2 and cast the result to epi16
      // RGBA: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
      // ]
      // RGB: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
      // ]
      auto source = _mm_unpacklo_epi8(source1, source2);
      auto pix = _mm_unpacklo_epi8(source, zero);
      // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision
      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
      // RGBA: pix = [
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  a1 0 A1 0
      // ]
      // RGB: pix = [
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  0 0 0 0
      // ]
      pix = _mm_unpackhi_epi8(source, zero);
      // Compute output value as C += w0 * c1 + w1 * C1 for each channel in 32-bit precision
      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
    }
    // Same processing as above but with a single weight value
    for (; i < ids_size; i += 1) {
      auto mmk = _mm_set1_epi32(k[i]);

      auto source1 = _mm_loadl_epi64((__m128i*) (lineIn_min + i * data_size));

      auto source = _mm_unpacklo_epi8(source1, zero);
      auto pix1 = _mm_unpacklo_epi8(source, zero);
      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix1, mmk));
      auto pix2 = _mm_unpackhi_epi8(source, zero);
      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix2, mmk));
    }
    // Shift back the output integer values: output = output >> weights_precision
    sss0 = _mm_srai_epi32(sss0, coefs_precision);
    sss1 = _mm_srai_epi32(sss1, coefs_precision);
    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
    // (a a a a b b b b | c c c c d d d d) -> (a a b b c c d d)
    sss0 = _mm_packs_epi32(sss0, sss1);
    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
    // (a a b b c c d d) -> (a b c d)
    sss0 = _mm_packus_epi16(sss0, sss0);
    // Store 2 pixels to the output
    _mm_storel_epi64((__m128i*)(lineOut + j), sss0);
  }

  // block 1
  const auto b1_usable_vec_stride = (4 / data_stride) * data_stride;
  for (; j < data_size - 4; j += b1_usable_vec_stride) {
    auto sss = initial;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;

    for (; i < ids_size - 1; i += 2) {
      // Load 2 values from weight vector
      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ... ]
      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);

      // Load one pixel per line
      // RGBA: source1 = [
      //    r0 g0 b0 a0  0 0 0 0  0 0 0 0  0 0 0 0
      // ]
      // RGB: source1 = [
      //    r0 g0 b0 r1  0 0 0 0  0 0 0 0  0 0 0 0
      // ]
      auto source1 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + i * data_size));
      auto source2 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + (i + 1) * data_size));

      // Interleave source1 and source2 and cast the result to epi16
      // RGBA: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
      // ]
      // RGB: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
      // ]
      auto source = _mm_unpacklo_epi8(source1, source2);
      auto pix = _mm_unpacklo_epi8(source, zero);
      // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }

    for (; i < ids_size; i++) {
      auto mmk = _mm_set1_epi32(k[i]);
      auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size);
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }
    sss = _mm_srai_epi32(sss, coefs_precision);
    sss = _mm_packs_epi32(sss, zero);
    sss = _mm_packus_epi16(sss, zero);

    auto o = _mm_cvtsi128_si32(sss);

    // Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
    // It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
    // We also wont go out of bounds of lineOut memory allocation
    std::memcpy(lineOut + j, (uint8_t *) &o, 4);
  }

  for (; j < data_size; j += data_stride) {
    auto sss = initial;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;
    // For RGBA we can use (ids_size - 1) as tighter limit but for RGB we can read outside memory boundary
    // for the last remaining line
    for (; i < ids_size - 2; i += 2) {
      // Load two coefficients at once
      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);

      // Load 2 lines
      auto source1 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + i * data_size));
      auto source2 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + (i + 1) * data_size));

      auto source = _mm_unpacklo_epi8(source1, source2);
      auto pix = _mm_unpacklo_epi8(source, zero);
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }

    // Same processing as above but with a single weight value
    for (; i < ids_size; i++) {
      auto mmk = _mm_set1_epi32(k[i]);

      const uint8_t * p = lineIn_min + i * data_size;
      __m128i pix;
      // There is no much perf gain using more detailed condition like
      // num_channels == 3 && ids_min + j + data_size * i + 4 >= in_max_size
      // const int64_t in_max_size = data_size * in_ysize;
      if (num_channels == 3) {
        uint8_t input[4];
        std::memcpy(input, p, 3);
        pix = mm_cvtepu8_epi32(input);
      } else {
        pix = mm_cvtepu8_epi32(p);
      }
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }

    // Shift back the output integer values: output = output >> weights_precision
    sss = _mm_srai_epi32(sss, coefs_precision);
    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
    // (a a a a b b b b | c c c c d d d d) -> (a a b b c c d d)
    sss = _mm_packs_epi32(sss, zero);
    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
    // (a a b b c c d d) -> (a b c d)
    sss = _mm_packus_epi16(sss, zero);
    // Store one pixel to the output
    auto o = _mm_cvtsi128_si32(sss);
    if (num_channels == 3 && C10_UNLIKELY(j + 4 >= data_size)) {
      std::memcpy(lineOut + j, (uint8_t *) &o, 3);
    } else {
      std::memcpy(lineOut + j, (uint8_t *) &o, 4);
    }
  }
}


void foo(
    uint8_t* C10_RESTRICT lineOut,
    const uint8_t* C10_RESTRICT lineIn,
    int64_t xsize,
    int64_t ids_min,
    int64_t ids_size,
    const int16_t* k,
    unsigned int coefs_precision)
{
    constexpr int num_channels = 3;

    ImagingResampleVerticalConvolution8u(
        lineOut,
        lineIn,
        xsize,
        ids_min,
        ids_size,
        k,
        coefs_precision,
        num_channels);
}
  • Source 2
#include <immintrin.h>
#include <cassert>
#include <cstring>


#define C10_RESTRICT __restrict
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))

using uint8_t = unsigned char;
using uint32_t = unsigned int;


static __m128i inline mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr) {
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
}


void ImagingResampleVerticalConvolution8u(
    uint8_t* C10_RESTRICT lineOut,
    const uint8_t* C10_RESTRICT lineIn,
    int64_t xsize,
    int64_t ids_min,
    int64_t ids_size,
    const int16_t* k,
    unsigned int coefs_precision,
    int64_t num_channels) {

  // Interpolation vertical pass processing one line.
  // - We process x-axis data with blocks of 8, 2 and 1
  // - We split the size of weight vector for a given output index as a sum: K = n * 2 + m.

  // xsize = output width, also equals to input width
  // ids_size = interpolation size
  // ids_min = input y start index
  const auto stride = num_channels * 1;  // num channels * sizeof(uint8)

//   TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4);
  assert(stride == 3 || stride == 4);

  const int64_t data_size = xsize * stride;
  const int64_t data_stride = stride;
  constexpr auto vec_size = 256 / 8;

  const auto initial = _mm_set1_epi32(1 << (coefs_precision - 1));
  const auto initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));
  const auto zero = _mm_setzero_si128();
  const auto zero_256 = _mm256_setzero_si256();

  int64_t j = 0;
  // block 8
  const auto b8_usable_vec_stride = (vec_size / data_stride) * data_stride;
  for (; j < data_size - vec_size; j += b8_usable_vec_stride) {
    auto sss0 = initial_256;
    auto sss1 = initial_256;
    auto sss2 = initial_256;
    auto sss3 = initial_256;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;

    for (; i < ids_size - 1; i += 2) {
      // Load 2 values from weight vector
      auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]);

      // RGBA: Load 8 pixels per line
      // source1 = [
      //    r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
      //    r4 g4 b4 a4  r5 g5 b5 a5  r6 g6 b6 a6  r7 g7 b7 a7
      // ]
      // RGB: Load 8 pixels per line (however we can process only 8 pixels):
      // source1 = [
      //    r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
      //    r4 g4 b4 r5  g5 b5 r6 g6  b6 r7 g7 b7  r8 g8 b8 r9
      // ]
      auto source1 =
          _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * i));
      auto source2 =
          _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * (i + 1)));

      // Interleave source1 and source2 from the low half of each 128-bit lane
      // and cast the result to epi16
      // RGBA: pix1 = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  a1 0 A1 0
      // ]
      // RGB: pix1 = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  0 0 0 0
      // ]
      auto source_lo = _mm256_unpacklo_epi8(source1, source2);
      auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256);
      // Compute output value as
      //   C += w0 * c0 + w1 * C0
      //   C += w0 * c1 + w1 * C1 for each channel in 32-bit precision
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));

      // RGBA: pix2 = [
      //    r2 0 R2 0  g2 0 G2 0  b2 0 B2 0  a2 0 A2 0
      //    r3 0 R3 0  g3 0 G3 0  b3 0 B3 0  a3 0 A3 0
      // ]
      // RGB: pix2 = [
      //    r2 0 R2 0  g2 0 G2 0  b2 0 B2 0  0 0 0 0
      //    r3 0 R3 0  g3 0 G3 0  b3 0 B3 0  0 0 0 0
      // ]
      auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256);
      // Compute output value as
      //   C += w0 * c2 + w1 * C2
      //   C += w0 * c3 + w1 * C3 for each channel in 32-bit precision
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));

      // Same as above for the high half of each 128-bit lane
      auto source_hi = _mm256_unpackhi_epi8(source1, source2);
      auto pix3 = _mm256_unpacklo_epi8(source_hi, zero_256);
      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk));
      auto pix4 = _mm256_unpackhi_epi8(source_hi, zero_256);
      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk));
    }
    // Same processing as above but with a single weight value
    for (; i < ids_size; i += 1) {
      auto mmk = _mm256_set1_epi32(k[i]);

      auto source1 = _mm256_loadu_si256((__m256i*)(lineIn_min + i * data_size));

      auto source_lo = _mm256_unpacklo_epi8(source1, zero_256);
      auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256);
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));
      auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256);
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));

      auto source_hi = _mm256_unpackhi_epi8(source1, zero_256);
      auto pix3 = _mm256_unpacklo_epi8(source_hi, _mm256_setzero_si256());
      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk));
      auto pix4 = _mm256_unpackhi_epi8(source_hi, _mm256_setzero_si256());
      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk));
    }
    // Shift back the output integer values: output = output >> weights_precision
    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
    sss2 = _mm256_srai_epi32(sss2, coefs_precision);
    sss3 = _mm256_srai_epi32(sss3, coefs_precision);
    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
    // (a a a a b b b b | c c c c d d d d) -> (a a b b c c d d)
    sss0 = _mm256_packs_epi32(sss0, sss1);
    sss2 = _mm256_packs_epi32(sss2, sss3);
    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
    // (a a b b c c d d) -> (a b c d)
    sss0 = _mm256_packus_epi16(sss0, sss2);

    // Stores 32 bytes
    _mm256_storeu_si256((__m256i*)(lineOut + j), sss0);
  }

  // TODO: Do we also need block 4 ???
  // block 2
  const auto b2_usable_vec_stride = (8 / data_stride) * data_stride;
  for (; j < data_size - vec_size / 4; j += b2_usable_vec_stride) {
    auto sss0 = initial;
    auto sss1 = initial;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;

    for (; i < ids_size - 1; i += 2) {
      // Load 2 values from weight vector
      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ... ]
      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);

      // Load 2 pixels per line
      // RGBA: source1 = [
      //    r0 g0 b0 a0  r1 g1 b1 a1  0 0 0 0  0 0 0 0
      // ]
      // RGB: source1 = [
      //    r0 g0 b0 r1  g1 b1 r2 g2  0 0 0 0  0 0 0 0
      // ]
      auto source1 = _mm_loadl_epi64((__m128i *) (lineIn_min + i * data_size));
      auto source2 = _mm_loadl_epi64((__m128i *) (lineIn_min + (i + 1) * data_size));
      // Interleave source1 and source2 and cast the result to epi16
      // RGBA: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
      // ]
      // RGB: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
      // ]
      auto source = _mm_unpacklo_epi8(source1, source2);
      auto pix = _mm_unpacklo_epi8(source, zero);
      // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision
      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
      // RGBA: pix = [
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  a1 0 A1 0
      // ]
      // RGB: pix = [
      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  0 0 0 0
      // ]
      pix = _mm_unpackhi_epi8(source, zero);
      // Compute output value as C += w0 * c1 + w1 * C1 for each channel in 32-bit precision
      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
    }
    // Same processing as above but with a single weight value
    for (; i < ids_size; i += 1) {
      auto mmk = _mm_set1_epi32(k[i]);

      auto source1 = _mm_loadl_epi64((__m128i*) (lineIn_min + i * data_size));

      auto source = _mm_unpacklo_epi8(source1, zero);
      auto pix1 = _mm_unpacklo_epi8(source, zero);
      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix1, mmk));
      auto pix2 = _mm_unpackhi_epi8(source, zero);
      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix2, mmk));
    }
    // Shift back the output integer values: output = output >> weights_precision
    sss0 = _mm_srai_epi32(sss0, coefs_precision);
    sss1 = _mm_srai_epi32(sss1, coefs_precision);
    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
    // (a a a a b b b b | c c c c d d d d) -> (a a b b c c d d)
    sss0 = _mm_packs_epi32(sss0, sss1);
    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
    // (a a b b c c d d) -> (a b c d)
    sss0 = _mm_packus_epi16(sss0, sss0);
    // Store 2 pixels to the output
    _mm_storel_epi64((__m128i*)(lineOut + j), sss0);
  }

  // block 1
  const auto b1_usable_vec_stride = (4 / data_stride) * data_stride;
  for (; j < data_size - 4; j += b1_usable_vec_stride) {
    auto sss = initial;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;

    for (; i < ids_size - 1; i += 2) {
      // Load 2 values from weight vector
      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ... ]
      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);

      // Load one pixel per line
      // RGBA: source1 = [
      //    r0 g0 b0 a0  0 0 0 0  0 0 0 0  0 0 0 0
      // ]
      // RGB: source1 = [
      //    r0 g0 b0 r1  0 0 0 0  0 0 0 0  0 0 0 0
      // ]
      auto source1 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + i * data_size));
      auto source2 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + (i + 1) * data_size));

      // Interleave source1 and source2 and cast the result to epi16
      // RGBA: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
      // ]
      // RGB: pix = [
      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
      // ]
      auto source = _mm_unpacklo_epi8(source1, source2);
      auto pix = _mm_unpacklo_epi8(source, zero);
      // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }

    for (; i < ids_size; i++) {
      auto mmk = _mm_set1_epi32(k[i]);
      auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size);
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }
    sss = _mm_srai_epi32(sss, coefs_precision);
    sss = _mm_packs_epi32(sss, zero);
    sss = _mm_packus_epi16(sss, zero);

    auto o = _mm_cvtsi128_si32(sss);

    // Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
    // It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
    // We also wont go out of bounds of lineOut memory allocation
    std::memcpy(lineOut + j, (uint8_t *) &o, 4);
  }

  for (; j < data_size; j += data_stride) {
    auto sss = initial;
    int64_t i = 0;
    const auto * lineIn_min = lineIn + j + ids_min;
    // For RGBA we can use (ids_size - 1) as tighter limit but for RGB we can read outside memory boundary
    // for the last remaining line
    for (; i < ids_size - 2; i += 2) {
      // Load two coefficients at once
      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);

      // Load 2 lines
      auto source1 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + i * data_size));
      auto source2 = _mm_cvtsi32_si128(*(int32_t*)(lineIn_min + (i + 1) * data_size));

      auto source = _mm_unpacklo_epi8(source1, source2);
      auto pix = _mm_unpacklo_epi8(source, zero);
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }

    // Same processing as above but with a single weight value
    for (; i < ids_size; i++) {
      auto mmk = _mm_set1_epi32(k[i]);

      const uint8_t * p = lineIn_min + i * data_size;
      __m128i pix;
      // There is no much perf gain using more detailed condition like
      // num_channels == 3 && ids_min + j + data_size * i + 4 >= in_max_size
      // const int64_t in_max_size = data_size * in_ysize;
      if (num_channels == 3) {
        uint8_t input[4];
        std::memcpy(input, p, 3);
        pix = mm_cvtepu8_epi32(input);
      } else {
        pix = mm_cvtepu8_epi32(p);
      }
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }

    // Shift back the output integer values: output = output >> weights_precision
    sss = _mm_srai_epi32(sss, coefs_precision);
    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
    // (a a a a b b b b | c c c c d d d d) -> (a a b b c c d d)
    sss = _mm_packs_epi32(sss, zero);
    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
    // (a a b b c c d d) -> (a b c d)
    sss = _mm_packus_epi16(sss, zero);
    // Store one pixel to the output
    auto o = _mm_cvtsi128_si32(sss);
    if (num_channels == 3 && C10_UNLIKELY(j + 4 >= data_size)) {
      std::memcpy(lineOut + j, (uint8_t *) &o, 3);
    } else {
      std::memcpy(lineOut + j, (uint8_t *) &o, 4);
    }
  }
}


void foo(
    uint8_t* C10_RESTRICT lineOut,
    const uint8_t* C10_RESTRICT lineIn,
    int64_t xsize,
    int64_t ids_min,
    int64_t ids_size,
    const int16_t* k,
    unsigned int coefs_precision)
{
    int num_channels = 3;

    ImagingResampleVerticalConvolution8u(
        lineOut,
        lineIn,
        xsize,
        ids_min,
        ids_size,
        k,
        coefs_precision,
        num_channels);
}
  • Assembly for source 1
    • compiler: x86-64 GCC 9.4
    • Flags: -std=c++17 -O3 -mavx -mfma -mavx2 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -Wno-deprecated -fvisibility-inlines-hidden -fopenmp -O3 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow
.LC0:
        .string "void ImagingResampleVerticalConvolution8u(uint8_t*, const uint8_t*, int64_t, int64_t, int64_t, const int16_t*, unsigned int, int64_t)"
.LC1:
        .string "/app/example.cpp"
.LC2:
        .string "stride == 3 || stride == 4"
ImagingResampleVerticalConvolution8u(unsigned char*, unsigned char const*, long, long, long, short const*, unsigned int, long):
        push    rbp
        mov     rbp, rsp
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        and     rsp, -32
        sub     rsp, 64
        mov     r10, QWORD PTR 24[rbp]
        mov     QWORD PTR 32[rsp], rsi
        lea     rax, -3[r10]
        mov     QWORD PTR 24[rsp], rcx
        cmp     rax, 1
        ja      .L62
        mov     eax, DWORD PTR 16[rbp]
        mov     rsi, rdx
        mov     r14, rdi
        imul    rsi, r10
        lea     ecx, -1[rax]
        mov     eax, 1
        sal     eax, cl
        vmovd   xmm2, eax
        vmovd   xmm10, eax
        mov     eax, 32
        lea     r15, -32[rsi]
        vpshufd xmm2, xmm2, 0
        vpbroadcastd    ymm10, xmm10
        cqo
        idiv    r10
        imul    rax, r10
        test    r15, r15
        jle     .L39
        lea     rdx, -2[r8]
        mov     QWORD PTR 24[rbp], r10
        lea     rdi, [rsi+rsi]
        vpxor   xmm5, xmm5, xmm5
        shr     rdx
        vmovd   xmm1, DWORD PTR 16[rbp]
        mov     r12, QWORD PTR 32[rsp]
        lea     rcx, 4[r9+rdx*4]
        lea     rdx, 2[rdx+rdx]
        add     r12, QWORD PTR 24[rsp]
        mov     QWORD PTR 40[rsp], rdx
        xor     edx, edx
.L9:
        cmp     r8, 1
        jle     .L63
        mov     r11, r12
        vmovdqa ymm8, ymm10
        vmovdqa ymm7, ymm10
        mov     r10, r9
        vmovdqa ymm9, ymm10
        vmovdqa ymm6, ymm10
.L4:
        vmovdqu ymm3, YMMWORD PTR [r11]
        vpunpcklbw      ymm0, ymm3, YMMWORD PTR [r11+rsi]
        add     r10, 4
        vpbroadcastd    ymm3, DWORD PTR -4[r10]
        vpunpcklbw      ymm4, ymm0, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpmaddwd        ymm4, ymm4, ymm3
        vpmaddwd        ymm0, ymm0, ymm3
        vpaddd  ymm6, ymm4, ymm6
        vpaddd  ymm9, ymm0, ymm9
        vmovdqu ymm4, YMMWORD PTR [r11]
        vpunpckhbw      ymm0, ymm4, YMMWORD PTR [r11+rsi]
        add     r11, rdi
        vpunpcklbw      ymm4, ymm0, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpmaddwd        ymm4, ymm4, ymm3
        vpmaddwd        ymm0, ymm0, ymm3
        vpaddd  ymm7, ymm4, ymm7
        vpaddd  ymm8, ymm0, ymm8
        cmp     r10, rcx
        jne     .L4
        mov     r11, QWORD PTR 40[rsp]
.L7:
        cmp     r8, r11
        jle     .L5
        mov     rbx, rsi
        movsx   r10d, WORD PTR [r9+r11*2]
        lea     r13, [r11+r11]
        imul    rbx, r11
        add     r11, 1
        vmovd   xmm11, r10d
        vpbroadcastd    ymm11, xmm11
        vmovdqu ymm3, YMMWORD PTR [r12+rbx]
        vmovdqu ymm0, YMMWORD PTR [r12+rbx]
        vpunpcklbw      ymm3, ymm3, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpunpcklbw      ymm4, ymm3, ymm5
        vpunpcklbw      ymm12, ymm0, ymm5
        vpunpckhbw      ymm3, ymm3, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpmaddwd        ymm4, ymm4, ymm11
        vpmaddwd        ymm3, ymm3, ymm11
        vpmaddwd        ymm12, ymm12, ymm11
        vpmaddwd        ymm0, ymm0, ymm11
        vpaddd  ymm4, ymm4, ymm6
        vpaddd  ymm3, ymm3, ymm9
        vpaddd  ymm12, ymm12, ymm7
        vpaddd  ymm0, ymm0, ymm8
        vmovdqa ymm6, ymm4
        vmovdqa ymm9, ymm3
        vmovdqa ymm7, ymm12
        vmovdqa ymm8, ymm0
        cmp     r8, r11
        jle     .L5
        lea     r11, [r12+rsi]
        movsx   r10d, WORD PTR 2[r9+r13]
        vmovdqu ymm8, YMMWORD PTR [r11+rbx]
        vmovd   xmm11, r10d
        vpunpcklbw      ymm9, ymm8, ymm5
        vpunpckhbw      ymm8, ymm8, ymm5
        vpbroadcastd    ymm11, xmm11
        vpunpcklbw      ymm6, ymm9, ymm5
        vpunpcklbw      ymm7, ymm8, ymm5
        vpunpckhbw      ymm9, ymm9, ymm5
        vpunpckhbw      ymm8, ymm8, ymm5
        vpmaddwd        ymm6, ymm6, ymm11
        vpmaddwd        ymm9, ymm9, ymm11
        vpmaddwd        ymm7, ymm7, ymm11
        vpmaddwd        ymm8, ymm8, ymm11
        vpaddd  ymm6, ymm6, ymm4
        vpaddd  ymm9, ymm9, ymm3
        vpaddd  ymm7, ymm7, ymm12
        vpaddd  ymm8, ymm8, ymm0
.L5:
        vpsrad  ymm6, ymm6, xmm1
        vpsrad  ymm9, ymm9, xmm1
        add     r12, rax
        vpsrad  ymm7, ymm7, xmm1
        vpsrad  ymm8, ymm8, xmm1
        vpackssdw       ymm6, ymm6, ymm9
        vpackssdw       ymm7, ymm7, ymm8
        vpackuswb       ymm6, ymm6, ymm7
        vmovdqu YMMWORD PTR [r14+rdx], ymm6
        add     rdx, rax
        cmp     rdx, r15
        jl      .L9
        mov     r10, QWORD PTR 24[rbp]
.L3:
        lea     r15, -8[rsi]
        lea     r12, [r10+r10]
        cmp     r15, rdx
        jle     .L18
        lea     rdi, -2[r8]
        mov     QWORD PTR 24[rbp], r10
        lea     rcx, [rsi+rsi]
        vpxor   xmm7, xmm7, xmm7
        shr     rdi
        mov     rax, QWORD PTR 24[rsp]
        vmovd   xmm1, DWORD PTR 16[rbp]
        lea     rbx, 2[rdi+rdi]
        mov     QWORD PTR 40[rsp], rbx
        lea     r11, [rdx+rax]
        lea     rax, 4[r9+rdi*4]
        add     r11, QWORD PTR 32[rsp]
.L19:
        vmovdqa xmm6, xmm2
        vmovdqa xmm5, xmm2
        xor     edi, edi
        cmp     r8, 1
        jle     .L16
        mov     r10, r11
        mov     rdi, r9
.L13:
        vmovq   xmm3, QWORD PTR [r10+rsi]
        vmovq   xmm0, QWORD PTR [r10]
        add     rdi, 4
        add     r10, rcx
        vpunpcklbw      xmm0, xmm0, xmm3
        vbroadcastss    xmm3, DWORD PTR -4[rdi]
        vpunpcklbw      xmm4, xmm0, xmm7
        vpunpckhbw      xmm0, xmm0, xmm7
        vpmaddwd        xmm4, xmm4, xmm3
        vpmaddwd        xmm0, xmm0, xmm3
        vpaddd  xmm5, xmm4, xmm5
        vpaddd  xmm6, xmm0, xmm6
        cmp     rax, rdi
        jne     .L13
        mov     rdi, QWORD PTR 40[rsp]
.L16:
        cmp     r8, rdi
        jle     .L14
        mov     rbx, rsi
        movsx   r10d, WORD PTR [r9+rdi*2]
        lea     r13, [rdi+rdi]
        imul    rbx, rdi
        add     rdi, 1
        vmovd   xmm4, r10d
        vpshufd xmm4, xmm4, 0
        vmovq   xmm0, QWORD PTR [r11+rbx]
        vpunpcklbw      xmm0, xmm0, xmm7
        vpunpcklbw      xmm3, xmm0, xmm7
        vpunpckhbw      xmm0, xmm0, xmm7
        vpmaddwd        xmm3, xmm3, xmm4
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm3, xmm5
        vpaddd  xmm0, xmm0, xmm6
        vmovdqa xmm5, xmm3
        vmovdqa xmm6, xmm0
        cmp     r8, rdi
        jle     .L14
        lea     r10, [r11+rsi]
        movsx   edi, WORD PTR 2[r9+r13]
        vmovq   xmm6, QWORD PTR [r10+rbx]
        vmovd   xmm4, edi
        vpunpcklbw      xmm6, xmm6, xmm7
        vpshufd xmm4, xmm4, 0
        vpunpcklbw      xmm5, xmm6, xmm7
        vpunpckhbw      xmm6, xmm6, xmm7
        vpmaddwd        xmm5, xmm5, xmm4
        vpmaddwd        xmm6, xmm6, xmm4
        vpaddd  xmm5, xmm5, xmm3
        vpaddd  xmm6, xmm6, xmm0
.L14:
        vpsrad  xmm5, xmm5, xmm1
        vpsrad  xmm6, xmm6, xmm1
        add     r11, r12
        vpackssdw       xmm5, xmm5, xmm6
        vpackuswb       xmm5, xmm5, xmm5
        vmovq   QWORD PTR [r14+rdx], xmm5
        add     rdx, r12
        cmp     rdx, r15
        jl      .L19
        mov     r10, QWORD PTR 24[rbp]
.L18:
        lea     r13, -4[rsi]
        cmp     r13, rdx
        jle     .L11
        mov     rax, QWORD PTR 24[rsp]
        lea     rdi, -2[r8]
        lea     rcx, [rsi+rsi]
        vpxor   xmm5, xmm5, xmm5
        shr     rdi
        vmovd   xmm1, DWORD PTR 16[rbp]
        vpxor   xmm7, xmm7, xmm7
        vpxor   xmm6, xmm6, xmm6
        lea     r11, [rdx+rax]
        lea     r15, 2[rdi+rdi]
        add     r11, QWORD PTR 32[rsp]
        lea     rax, 4[r9+rdi*4]
.L26:
        mov     rbx, r11
        mov     rdi, r9
        vmovdqa xmm3, xmm2
        cmp     r8, 1
        jle     .L64
.L21:
        vmovd   xmm4, DWORD PTR [rbx+rsi]
        vmovd   xmm0, DWORD PTR [rbx]
        add     rdi, 4
        add     rbx, rcx
        vpunpcklbw      xmm0, xmm0, xmm4
        vbroadcastss    xmm4, DWORD PTR -4[rdi]
        vpunpcklbw      xmm0, xmm0, xmm5
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm0, xmm3
        cmp     rax, rdi
        jne     .L21
        mov     rdi, r15
.L24:
        cmp     r8, rdi
        jle     .L22
        movsx   r12d, WORD PTR [r9+rdi*2]
        lea     rbx, [rdi+rdi]
        mov     DWORD PTR 40[rsp], r12d
        mov     r12, rsi
        vbroadcastss    xmm4, DWORD PTR 40[rsp]
        imul    r12, rdi
        add     rdi, 1
        vpmovzxbd       xmm0, DWORD PTR [r11+r12]
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm0, xmm3
        cmp     r8, rdi
        jle     .L22
        movsx   edi, WORD PTR 2[r9+rbx]
        lea     rbx, [r11+rsi]
        vpmovzxbd       xmm0, DWORD PTR [rbx+r12]
        vmovd   xmm4, edi
        vpshufd xmm4, xmm4, 0
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm3, xmm0
.L22:
        vpsrad  xmm3, xmm3, xmm1
        add     r11, r10
        vpackssdw       xmm3, xmm3, xmm7
        vpackuswb       xmm3, xmm3, xmm6
        vmovd   DWORD PTR [r14+rdx], xmm3
        add     rdx, r10
        cmp     rdx, r13
        jl      .L26
.L11:
        cmp     rsi, rdx
        jle     .L59
        lea     rbx, [r9+r8*2]
        mov     r11, QWORD PTR 24[rsp]
        lea     rdi, -3[r8]
        vmovd   xmm1, DWORD PTR 16[rbp]
        mov     QWORD PTR 24[rsp], rbx
        shr     rdi
        lea     rcx, [rsi+rsi]
        vpxor   xmm4, xmm4, xmm4
        lea     rax, 4[r9+rdi*4]
        vpxor   xmm6, xmm6, xmm6
        vpxor   xmm5, xmm5, xmm5
        add     r11, rdx
        lea     r13, 2[rdi+rdi]
        add     r11, QWORD PTR 32[rsp]
.L38:
        mov     rbx, r11
        mov     rdi, r9
        vmovdqa xmm3, xmm2
        cmp     r8, 2
        jle     .L65
.L27:
        vmovd   xmm7, DWORD PTR [rbx+rsi]
        vmovd   xmm0, DWORD PTR [rbx]
        add     rdi, 4
        add     rbx, rcx
        vpunpcklbw      xmm0, xmm0, xmm7
        vbroadcastss    xmm7, DWORD PTR -4[rdi]
        vpunpcklbw      xmm0, xmm0, xmm4
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm3, xmm0, xmm3
        cmp     rdi, rax
        jne     .L27
        mov     rdi, r13
.L30:
        cmp     r8, rdi
        jle     .L66
        cmp     r10, 3
        je      .L31
        mov     r12, rsi
        movsx   ebx, WORD PTR [r9+rdi*2]
        lea     r15, [rdi+rdi]
        imul    r12, rdi
        vmovd   xmm7, ebx
        lea     rbx, 1[rdi]
        vpshufd xmm7, xmm7, 0
        vpmovzxbd       xmm0, DWORD PTR [r11+r12]
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm3, xmm0, xmm3
        vmovdqa xmm0, xmm3
        cmp     r8, rbx
        jle     .L32
        movsx   ebx, WORD PTR 2[r9+r15]
        add     r12, rsi
        add     rdi, 2
        vpmovzxbd       xmm0, DWORD PTR [r11+r12]
        vmovd   xmm7, ebx
        vpshufd xmm7, xmm7, 0
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm3, xmm0, xmm3
        vmovdqa xmm0, xmm3
        cmp     r8, rdi
        jle     .L32
        movsx   edi, WORD PTR 4[r9+r15]
        lea     rbx, [r11+rsi]
        vpmovzxbd       xmm0, DWORD PTR [rbx+r12]
        vmovd   xmm7, edi
        vpshufd xmm7, xmm7, 0
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm3
.L32:
        vpsrad  xmm0, xmm0, xmm1
        vpackssdw       xmm0, xmm0, xmm6
        vpackuswb       xmm0, xmm0, xmm5
        vmovd   edi, xmm0
.L33:
        mov     DWORD PTR [r14+rdx], edi
.L36:
        add     rdx, r10
        add     r11, r10
        cmp     rsi, rdx
        jg      .L38
.L59:
        vzeroupper
        lea     rsp, -40[rbp]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret
.L66:
        vpsrad  xmm0, xmm3, xmm1
        lea     r12, [r14+rdx]
        vpackssdw       xmm0, xmm0, xmm6
        vpackuswb       xmm0, xmm0, xmm5
        vmovd   edi, xmm0
        vmovd   DWORD PTR 60[rsp], xmm0
        cmp     r10, 3
        jne     .L33
.L35:
        lea     rbx, 4[rdx]
        cmp     rsi, rbx
        jg      .L33
        movzx   edi, WORD PTR 60[rsp]
        mov     WORD PTR [r12], di
        movzx   edi, BYTE PTR 62[rsp]
        mov     BYTE PTR 2[r12], dil
        jmp     .L36
.L31:
        lea     r12, [r9+rdi*2]
        mov     QWORD PTR 32[rsp], rax
        mov     rbx, QWORD PTR 24[rsp]
        lea     r15, 60[rsp]
        imul    rdi, rsi
        add     rdi, r11
.L34:
        movsx   eax, WORD PTR [r12]
        add     r12, 2
        mov     DWORD PTR 40[rsp], eax
        movzx   eax, WORD PTR [rdi]
        vbroadcastss    xmm7, DWORD PTR 40[rsp]
        mov     WORD PTR [r15], ax
        movzx   eax, BYTE PTR 2[rdi]
        add     rdi, rsi
        mov     BYTE PTR 2[r15], al
        vpmovzxbd       xmm0, DWORD PTR 60[rsp]
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm3
        vmovdqa xmm3, xmm0
        cmp     rbx, r12
        jne     .L34
        vpsrad  xmm0, xmm0, xmm1
        mov     rax, QWORD PTR 32[rsp]
        lea     r12, [r14+rdx]
        vpackssdw       xmm0, xmm0, xmm6
        vpackuswb       xmm0, xmm0, xmm5
        vmovd   edi, xmm0
        vmovd   DWORD PTR 60[rsp], xmm0
        jmp     .L35
.L65:
        xor     edi, edi
        jmp     .L30
.L64:
        xor     edi, edi
        jmp     .L24
.L63:
        vmovdqa ymm8, ymm10
        vmovdqa ymm7, ymm10
        vmovdqa ymm9, ymm10
        xor     r11d, r11d
        vmovdqa ymm6, ymm10
        jmp     .L7
.L39:
        xor     edx, edx
        jmp     .L3
.L62:
        lea     rcx, .LC0[rip]
        mov     edx, 38
        lea     rsi, .LC1[rip]
        lea     rdi, .LC2[rip]
        call    __assert_fail@PLT
foo(unsigned char*, unsigned char const*, long, long, long, short const*, unsigned int):
        sub     rsp, 8
        push    3
        mov     eax, DWORD PTR 24[rsp]
        push    rax
        call    ImagingResampleVerticalConvolution8u(unsigned char*, unsigned char const*, long, long, long, short const*, unsigned int, long)@PLT
        add     rsp, 24
        ret
  • Assembly for source 2
    • compiler: x86-64 GCC 9.4
    • Flags: -std=c++17 -O3 -mavx -mfma -mavx2 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -Wno-deprecated -fvisibility-inlines-hidden -fopenmp -O3 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow
.LC0:
        .string "void ImagingResampleVerticalConvolution8u(uint8_t*, const uint8_t*, int64_t, int64_t, int64_t, const int16_t*, unsigned int, int64_t)"
.LC1:
        .string "/app/example.cpp"
.LC2:
        .string "stride == 3 || stride == 4"
ImagingResampleVerticalConvolution8u(unsigned char*, unsigned char const*, long, long, long, short const*, unsigned int, long):
        push    rbp
        mov     rbp, rsp
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        and     rsp, -32
        sub     rsp, 64
        mov     r10, QWORD PTR 24[rbp]
        mov     QWORD PTR 32[rsp], rsi
        lea     rax, -3[r10]
        mov     QWORD PTR 24[rsp], rcx
        cmp     rax, 1
        ja      .L62
        mov     eax, DWORD PTR 16[rbp]
        mov     rsi, rdx
        mov     r14, rdi
        imul    rsi, r10
        lea     ecx, -1[rax]
        mov     eax, 1
        sal     eax, cl
        vmovd   xmm2, eax
        vmovd   xmm10, eax
        mov     eax, 32
        lea     r15, -32[rsi]
        vpshufd xmm2, xmm2, 0
        vpbroadcastd    ymm10, xmm10
        cqo
        idiv    r10
        imul    rax, r10
        test    r15, r15
        jle     .L39
        lea     rdx, -2[r8]
        mov     QWORD PTR 24[rbp], r10
        lea     rdi, [rsi+rsi]
        vpxor   xmm5, xmm5, xmm5
        shr     rdx
        vmovd   xmm1, DWORD PTR 16[rbp]
        mov     r12, QWORD PTR 32[rsp]
        lea     rcx, 4[r9+rdx*4]
        lea     rdx, 2[rdx+rdx]
        add     r12, QWORD PTR 24[rsp]
        mov     QWORD PTR 40[rsp], rdx
        xor     edx, edx
.L9:
        cmp     r8, 1
        jle     .L63
        mov     r11, r12
        vmovdqa ymm8, ymm10
        vmovdqa ymm7, ymm10
        mov     r10, r9
        vmovdqa ymm9, ymm10
        vmovdqa ymm6, ymm10
.L4:
        vmovdqu ymm3, YMMWORD PTR [r11]
        vpunpcklbw      ymm0, ymm3, YMMWORD PTR [r11+rsi]
        add     r10, 4
        vpbroadcastd    ymm3, DWORD PTR -4[r10]
        vpunpcklbw      ymm4, ymm0, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpmaddwd        ymm4, ymm4, ymm3
        vpmaddwd        ymm0, ymm0, ymm3
        vpaddd  ymm6, ymm4, ymm6
        vpaddd  ymm9, ymm0, ymm9
        vmovdqu ymm4, YMMWORD PTR [r11]
        vpunpckhbw      ymm0, ymm4, YMMWORD PTR [r11+rsi]
        add     r11, rdi
        vpunpcklbw      ymm4, ymm0, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpmaddwd        ymm4, ymm4, ymm3
        vpmaddwd        ymm0, ymm0, ymm3
        vpaddd  ymm7, ymm4, ymm7
        vpaddd  ymm8, ymm0, ymm8
        cmp     r10, rcx
        jne     .L4
        mov     r11, QWORD PTR 40[rsp]
.L7:
        cmp     r8, r11
        jle     .L5
        mov     rbx, rsi
        movsx   r10d, WORD PTR [r9+r11*2]
        lea     r13, [r11+r11]
        imul    rbx, r11
        add     r11, 1
        vmovd   xmm11, r10d
        vpbroadcastd    ymm11, xmm11
        vmovdqu ymm3, YMMWORD PTR [r12+rbx]
        vmovdqu ymm0, YMMWORD PTR [r12+rbx]
        vpunpcklbw      ymm3, ymm3, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpunpcklbw      ymm4, ymm3, ymm5
        vpunpcklbw      ymm12, ymm0, ymm5
        vpunpckhbw      ymm3, ymm3, ymm5
        vpunpckhbw      ymm0, ymm0, ymm5
        vpmaddwd        ymm4, ymm4, ymm11
        vpmaddwd        ymm3, ymm3, ymm11
        vpmaddwd        ymm12, ymm12, ymm11
        vpmaddwd        ymm0, ymm0, ymm11
        vpaddd  ymm4, ymm4, ymm6
        vpaddd  ymm3, ymm3, ymm9
        vpaddd  ymm12, ymm12, ymm7
        vpaddd  ymm0, ymm0, ymm8
        vmovdqa ymm6, ymm4
        vmovdqa ymm9, ymm3
        vmovdqa ymm7, ymm12
        vmovdqa ymm8, ymm0
        cmp     r8, r11
        jle     .L5
        lea     r11, [r12+rsi]
        movsx   r10d, WORD PTR 2[r9+r13]
        vmovdqu ymm8, YMMWORD PTR [r11+rbx]
        vmovd   xmm11, r10d
        vpunpcklbw      ymm9, ymm8, ymm5
        vpunpckhbw      ymm8, ymm8, ymm5
        vpbroadcastd    ymm11, xmm11
        vpunpcklbw      ymm6, ymm9, ymm5
        vpunpcklbw      ymm7, ymm8, ymm5
        vpunpckhbw      ymm9, ymm9, ymm5
        vpunpckhbw      ymm8, ymm8, ymm5
        vpmaddwd        ymm6, ymm6, ymm11
        vpmaddwd        ymm9, ymm9, ymm11
        vpmaddwd        ymm7, ymm7, ymm11
        vpmaddwd        ymm8, ymm8, ymm11
        vpaddd  ymm6, ymm6, ymm4
        vpaddd  ymm9, ymm9, ymm3
        vpaddd  ymm7, ymm7, ymm12
        vpaddd  ymm8, ymm8, ymm0
.L5:
        vpsrad  ymm6, ymm6, xmm1
        vpsrad  ymm9, ymm9, xmm1
        add     r12, rax
        vpsrad  ymm7, ymm7, xmm1
        vpsrad  ymm8, ymm8, xmm1
        vpackssdw       ymm6, ymm6, ymm9
        vpackssdw       ymm7, ymm7, ymm8
        vpackuswb       ymm6, ymm6, ymm7
        vmovdqu YMMWORD PTR [r14+rdx], ymm6
        add     rdx, rax
        cmp     rdx, r15
        jl      .L9
        mov     r10, QWORD PTR 24[rbp]
.L3:
        lea     r15, -8[rsi]
        lea     r12, [r10+r10]
        cmp     r15, rdx
        jle     .L18
        lea     rdi, -2[r8]
        mov     QWORD PTR 24[rbp], r10
        lea     rcx, [rsi+rsi]
        vpxor   xmm7, xmm7, xmm7
        shr     rdi
        mov     rax, QWORD PTR 24[rsp]
        vmovd   xmm1, DWORD PTR 16[rbp]
        lea     rbx, 2[rdi+rdi]
        mov     QWORD PTR 40[rsp], rbx
        lea     r11, [rdx+rax]
        lea     rax, 4[r9+rdi*4]
        add     r11, QWORD PTR 32[rsp]
.L19:
        vmovdqa xmm6, xmm2
        vmovdqa xmm5, xmm2
        xor     edi, edi
        cmp     r8, 1
        jle     .L16
        mov     r10, r11
        mov     rdi, r9
.L13:
        vmovq   xmm3, QWORD PTR [r10+rsi]
        vmovq   xmm0, QWORD PTR [r10]
        add     rdi, 4
        add     r10, rcx
        vpunpcklbw      xmm0, xmm0, xmm3
        vbroadcastss    xmm3, DWORD PTR -4[rdi]
        vpunpcklbw      xmm4, xmm0, xmm7
        vpunpckhbw      xmm0, xmm0, xmm7
        vpmaddwd        xmm4, xmm4, xmm3
        vpmaddwd        xmm0, xmm0, xmm3
        vpaddd  xmm5, xmm4, xmm5
        vpaddd  xmm6, xmm0, xmm6
        cmp     rax, rdi
        jne     .L13
        mov     rdi, QWORD PTR 40[rsp]
.L16:
        cmp     r8, rdi
        jle     .L14
        mov     rbx, rsi
        movsx   r10d, WORD PTR [r9+rdi*2]
        lea     r13, [rdi+rdi]
        imul    rbx, rdi
        add     rdi, 1
        vmovd   xmm4, r10d
        vpshufd xmm4, xmm4, 0
        vmovq   xmm0, QWORD PTR [r11+rbx]
        vpunpcklbw      xmm0, xmm0, xmm7
        vpunpcklbw      xmm3, xmm0, xmm7
        vpunpckhbw      xmm0, xmm0, xmm7
        vpmaddwd        xmm3, xmm3, xmm4
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm3, xmm5
        vpaddd  xmm0, xmm0, xmm6
        vmovdqa xmm5, xmm3
        vmovdqa xmm6, xmm0
        cmp     r8, rdi
        jle     .L14
        lea     r10, [r11+rsi]
        movsx   edi, WORD PTR 2[r9+r13]
        vmovq   xmm6, QWORD PTR [r10+rbx]
        vmovd   xmm4, edi
        vpunpcklbw      xmm6, xmm6, xmm7
        vpshufd xmm4, xmm4, 0
        vpunpcklbw      xmm5, xmm6, xmm7
        vpunpckhbw      xmm6, xmm6, xmm7
        vpmaddwd        xmm5, xmm5, xmm4
        vpmaddwd        xmm6, xmm6, xmm4
        vpaddd  xmm5, xmm5, xmm3
        vpaddd  xmm6, xmm6, xmm0
.L14:
        vpsrad  xmm5, xmm5, xmm1
        vpsrad  xmm6, xmm6, xmm1
        add     r11, r12
        vpackssdw       xmm5, xmm5, xmm6
        vpackuswb       xmm5, xmm5, xmm5
        vmovq   QWORD PTR [r14+rdx], xmm5
        add     rdx, r12
        cmp     rdx, r15
        jl      .L19
        mov     r10, QWORD PTR 24[rbp]
.L18:
        lea     r13, -4[rsi]
        cmp     r13, rdx
        jle     .L11
        mov     rax, QWORD PTR 24[rsp]
        lea     rdi, -2[r8]
        lea     rcx, [rsi+rsi]
        vpxor   xmm5, xmm5, xmm5
        shr     rdi
        vmovd   xmm1, DWORD PTR 16[rbp]
        vpxor   xmm7, xmm7, xmm7
        vpxor   xmm6, xmm6, xmm6
        lea     r11, [rdx+rax]
        lea     r15, 2[rdi+rdi]
        add     r11, QWORD PTR 32[rsp]
        lea     rax, 4[r9+rdi*4]
.L26:
        mov     rbx, r11
        mov     rdi, r9
        vmovdqa xmm3, xmm2
        cmp     r8, 1
        jle     .L64
.L21:
        vmovd   xmm4, DWORD PTR [rbx+rsi]
        vmovd   xmm0, DWORD PTR [rbx]
        add     rdi, 4
        add     rbx, rcx
        vpunpcklbw      xmm0, xmm0, xmm4
        vbroadcastss    xmm4, DWORD PTR -4[rdi]
        vpunpcklbw      xmm0, xmm0, xmm5
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm0, xmm3
        cmp     rax, rdi
        jne     .L21
        mov     rdi, r15
.L24:
        cmp     r8, rdi
        jle     .L22
        movsx   r12d, WORD PTR [r9+rdi*2]
        lea     rbx, [rdi+rdi]
        mov     DWORD PTR 40[rsp], r12d
        mov     r12, rsi
        vbroadcastss    xmm4, DWORD PTR 40[rsp]
        imul    r12, rdi
        add     rdi, 1
        vpmovzxbd       xmm0, DWORD PTR [r11+r12]
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm0, xmm3
        cmp     r8, rdi
        jle     .L22
        movsx   edi, WORD PTR 2[r9+rbx]
        lea     rbx, [r11+rsi]
        vpmovzxbd       xmm0, DWORD PTR [rbx+r12]
        vmovd   xmm4, edi
        vpshufd xmm4, xmm4, 0
        vpmaddwd        xmm0, xmm0, xmm4
        vpaddd  xmm3, xmm3, xmm0
.L22:
        vpsrad  xmm3, xmm3, xmm1
        add     r11, r10
        vpackssdw       xmm3, xmm3, xmm7
        vpackuswb       xmm3, xmm3, xmm6
        vmovd   DWORD PTR [r14+rdx], xmm3
        add     rdx, r10
        cmp     rdx, r13
        jl      .L26
.L11:
        cmp     rsi, rdx
        jle     .L59
        lea     rbx, [r9+r8*2]
        mov     r11, QWORD PTR 24[rsp]
        lea     rdi, -3[r8]
        vmovd   xmm1, DWORD PTR 16[rbp]
        mov     QWORD PTR 24[rsp], rbx
        shr     rdi
        lea     rcx, [rsi+rsi]
        vpxor   xmm4, xmm4, xmm4
        lea     rax, 4[r9+rdi*4]
        vpxor   xmm6, xmm6, xmm6
        vpxor   xmm5, xmm5, xmm5
        add     r11, rdx
        lea     r13, 2[rdi+rdi]
        add     r11, QWORD PTR 32[rsp]
.L38:
        mov     rbx, r11
        mov     rdi, r9
        vmovdqa xmm3, xmm2
        cmp     r8, 2
        jle     .L65
.L27:
        vmovd   xmm7, DWORD PTR [rbx+rsi]
        vmovd   xmm0, DWORD PTR [rbx]
        add     rdi, 4
        add     rbx, rcx
        vpunpcklbw      xmm0, xmm0, xmm7
        vbroadcastss    xmm7, DWORD PTR -4[rdi]
        vpunpcklbw      xmm0, xmm0, xmm4
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm3, xmm0, xmm3
        cmp     rdi, rax
        jne     .L27
        mov     rdi, r13
.L30:
        cmp     r8, rdi
        jle     .L66
        cmp     r10, 3
        je      .L31
        mov     r12, rsi
        movsx   ebx, WORD PTR [r9+rdi*2]
        lea     r15, [rdi+rdi]
        imul    r12, rdi
        vmovd   xmm7, ebx
        lea     rbx, 1[rdi]
        vpshufd xmm7, xmm7, 0
        vpmovzxbd       xmm0, DWORD PTR [r11+r12]
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm3, xmm0, xmm3
        vmovdqa xmm0, xmm3
        cmp     r8, rbx
        jle     .L32
        movsx   ebx, WORD PTR 2[r9+r15]
        add     r12, rsi
        add     rdi, 2
        vpmovzxbd       xmm0, DWORD PTR [r11+r12]
        vmovd   xmm7, ebx
        vpshufd xmm7, xmm7, 0
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm3, xmm0, xmm3
        vmovdqa xmm0, xmm3
        cmp     r8, rdi
        jle     .L32
        movsx   edi, WORD PTR 4[r9+r15]
        lea     rbx, [r11+rsi]
        vpmovzxbd       xmm0, DWORD PTR [rbx+r12]
        vmovd   xmm7, edi
        vpshufd xmm7, xmm7, 0
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm3
.L32:
        vpsrad  xmm0, xmm0, xmm1
        vpackssdw       xmm0, xmm0, xmm6
        vpackuswb       xmm0, xmm0, xmm5
        vmovd   edi, xmm0
.L33:
        mov     DWORD PTR [r14+rdx], edi
.L36:
        add     rdx, r10
        add     r11, r10
        cmp     rsi, rdx
        jg      .L38
.L59:
        vzeroupper
        lea     rsp, -40[rbp]
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret
.L66:
        vpsrad  xmm0, xmm3, xmm1
        lea     r12, [r14+rdx]
        vpackssdw       xmm0, xmm0, xmm6
        vpackuswb       xmm0, xmm0, xmm5
        vmovd   edi, xmm0
        vmovd   DWORD PTR 60[rsp], xmm0
        cmp     r10, 3
        jne     .L33
.L35:
        lea     rbx, 4[rdx]
        cmp     rsi, rbx
        jg      .L33
        movzx   edi, WORD PTR 60[rsp]
        mov     WORD PTR [r12], di
        movzx   edi, BYTE PTR 62[rsp]
        mov     BYTE PTR 2[r12], dil
        jmp     .L36
.L31:
        lea     r12, [r9+rdi*2]
        mov     QWORD PTR 32[rsp], rax
        mov     rbx, QWORD PTR 24[rsp]
        lea     r15, 60[rsp]
        imul    rdi, rsi
        add     rdi, r11
.L34:
        movsx   eax, WORD PTR [r12]
        add     r12, 2
        mov     DWORD PTR 40[rsp], eax
        movzx   eax, WORD PTR [rdi]
        vbroadcastss    xmm7, DWORD PTR 40[rsp]
        mov     WORD PTR [r15], ax
        movzx   eax, BYTE PTR 2[rdi]
        add     rdi, rsi
        mov     BYTE PTR 2[r15], al
        vpmovzxbd       xmm0, DWORD PTR 60[rsp]
        vpmaddwd        xmm0, xmm0, xmm7
        vpaddd  xmm0, xmm0, xmm3
        vmovdqa xmm3, xmm0
        cmp     rbx, r12
        jne     .L34
        vpsrad  xmm0, xmm0, xmm1
        mov     rax, QWORD PTR 32[rsp]
        lea     r12, [r14+rdx]
        vpackssdw       xmm0, xmm0, xmm6
        vpackuswb       xmm0, xmm0, xmm5
        vmovd   edi, xmm0
        vmovd   DWORD PTR 60[rsp], xmm0
        jmp     .L35
.L65:
        xor     edi, edi
        jmp     .L30
.L64:
        xor     edi, edi
        jmp     .L24
.L63:
        vmovdqa ymm8, ymm10
        vmovdqa ymm7, ymm10
        vmovdqa ymm9, ymm10
        xor     r11d, r11d
        vmovdqa ymm6, ymm10
        jmp     .L7
.L39:
        xor     edx, edx
        jmp     .L3
.L62:
        lea     rcx, .LC0[rip]
        mov     edx, 38
        lea     rsi, .LC1[rip]
        lea     rdi, .LC2[rip]
        call    __assert_fail@PLT
foo(unsigned char*, unsigned char const*, long, long, long, short const*, unsigned int):
        sub     rsp, 8
        push    3
        mov     eax, DWORD PTR 24[rsp]
        push    rax
        call    ImagingResampleVerticalConvolution8u(unsigned char*, unsigned char const*, long, long, long, short const*, unsigned int, long)@PLT
        add     rsp, 24
        ret