Vector instructions. Part II: Vectorization

The scalar reference: copying a square SIZE x SIZE block of 8-bit pixels. The stride parameters let the source and destination blocks live inside frames of different widths:

template <int SIZE>
void copy_mb(const uint8_t * src,
             uint8_t * dst,
             size_t src_stride,
             size_t dst_stride)
{
    for(int i = 0; i < SIZE; i++)
    {
        for(int j = 0; j < SIZE; j++)
        {
            dst[ j ] = src[ j ];
        }
        src += src_stride;
        dst += dst_stride;
    }
}
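
Because SIZE is a template parameter, each block size gets its own specialization with compile-time trip counts, which the compiler can unroll or auto-vectorize. A minimal usage sketch (the wrapper name is ours):

#include <stdint.h>
#include <stddef.h>

/* assumes copy_mb<> from the listing above is in scope */
void copy_examples(const uint8_t * src, uint8_t * dst, size_t stride)
{
    copy_mb<4>(src, dst, stride, stride);  // 4x4 block
    copy_mb<8>(src, dst, stride, stride);  // 8x8 block
    copy_mb<16>(src, dst, stride, stride); // 16x16 block
}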

The same copy written with SSE2 intrinsics, one version per block width:

#include <emmintrin.h>

void copy_mb_4(const uint8_t * src,
               uint8_t * dst,
               size_t src_stride,
               size_t dst_stride)
{
    __m128i x0;
    for(int i = 0; i < 4; i++)
    {
        x0 = _mm_cvtsi32_si128(*(int32_t*) src); // load 4 bytes
        *(int32_t*) dst = _mm_cvtsi128_si32(x0); // store 4 bytes
        src += src_stride;
        dst += dst_stride;
    }
}// copy_mb_4

void copy_mb_8(const uint8_t * src,
               uint8_t * dst,
               size_t src_stride,
               size_t dst_stride)
{
    __m128i x0;
    for(int i = 0; i < 8; i++)
    {
        x0 = _mm_loadl_epi64((__m128i*)src); // load 8 bytes
        _mm_storel_epi64((__m128i*)dst, x0); // store 8 bytes
        src += src_stride;
        dst += dst_stride;
    }
}// copy_mb_8

void copy_mb_16(const uint8_t * src,
                uint8_t * dst,
                size_t src_stride,
                size_t dst_stride)
{
    __m128i x0;
    for(int i = 0; i < 16; i++)
    {
        x0 = _mm_loadu_si128((__m128i*)src); // unaligned load of 16 bytes
        _mm_storeu_si128((__m128i*)dst, x0); // unaligned store of 16 bytes
        src += src_stride;
        dst += dst_stride;
    }
}// copy_mb_16
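
A quick way to validate the intrinsic versions is to compare them with the scalar template on random data. A minimal test sketch, assuming the listings above are in the same translation unit:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int main()
{
    const size_t stride = 64;
    static uint8_t src[64 * 64], dst_c[64 * 64], dst_simd[64 * 64];
    for(size_t i = 0; i < sizeof(src); i++)
        src[i] = (uint8_t)rand();
    memset(dst_c, 0, sizeof(dst_c));
    memset(dst_simd, 0, sizeof(dst_simd));
    copy_mb<16>(src, dst_c, stride, stride);   // scalar template
    copy_mb_16(src, dst_simd, stride, stride); // SSE2 version
    printf("16x16 copy: %s\n",
        memcmp(dst_c, dst_simd, sizeof(dst_c)) == 0 ? "OK" : "MISMATCH");
    return 0;
}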

For small blocks the loop can be unrolled completely, so the 4x4 copy needs no loop at all:

#include <emmintrin.h>

void copy_mb_4(const uint8_t * src,
               uint8_t * dst,
               size_t src_stride,
               size_t dst_stride)
{
    __m128i x0, x1, x2, x3;
    x0 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * src_stride));
    x1 = _mm_cvtsi32_si128(*(int32_t*)(src + 1 * src_stride));
    x2 = _mm_cvtsi32_si128(*(int32_t*)(src + 2 * src_stride));
    x3 = _mm_cvtsi32_si128(*(int32_t*)(src + 3 * src_stride));
    *(int32_t*)(dst + 0 * dst_stride) = _mm_cvtsi128_si32(x0);
    *(int32_t*)(dst + 1 * dst_stride) = _mm_cvtsi128_si32(x1);
    *(int32_t*)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(x2);
    *(int32_t*)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(x3);
}
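
The same pattern maps directly to other SIMD instruction sets. For comparison, a sketch of the 16-pixel-wide copy with ARM NEON intrinsics (the function name is ours; see the NEON links at the end of the article):

#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

void copy_mb_16_neon(const uint8_t * src,
                     uint8_t * dst,
                     size_t src_stride,
                     size_t dst_stride)
{
    for(int i = 0; i < 16; i++)
    {
        uint8x16_t v = vld1q_u8(src); // load 16 bytes
        vst1q_u8(dst, v);             // store 16 bytes
        src += src_stride;
        dst += dst_stride;
    }
}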

The next operation is compensation: a source block is added to a destination block element by element. The scalar reference for 16-bit samples (the addition wraps modulo 2^16):

template <int SIZE>
void compensate_mb(const uint16_t * src,
                   uint16_t * dst,
                   size_t src_stride,
                   size_t dst_stride)
{
    for(int i = 0; i < SIZE; i++)
    {
        for(int j = 0; j < SIZE; j++)
        {
            dst[ j ] = dst[ j ] + src[ j ];
        }
        src += src_stride;
        dst += dst_stride;
    }
}

The 8x8 vector version; _mm_add_epi16 adds eight 16-bit samples at once and wraps on overflow, just like the scalar code:

void compensate_8(const uint16_t * src,
                  uint16_t * dst,
                  size_t src_stride,
                  size_t dst_stride)
{
    __m128i x0, x1;
    for(int i = 0; i < 8; i++)
    {
        x0 = _mm_loadu_si128((__m128i*)src); // 8 pixels
        x1 = _mm_loadu_si128((__m128i*)dst);
        x0 = _mm_add_epi16( x0, x1);
        _mm_storeu_si128((__m128i*)dst, x0);
        src += src_stride;
        dst += dst_stride;
    }
}

In a real codec the result must be clipped to the valid sample range. The scalar version for a signed 16-bit residual added to 8-bit samples:

template <int SIZE>
void compensate_mb(const int16_t * src,
                   uint8_t * dst,
                   size_t src_stride,
                   size_t dst_stride)
{
    for(int i = 0; i < SIZE; i++)
    {
        for(int j = 0; j < SIZE; j++)
        {
            int tmp = dst[ j ] + src[ j ];
            if(tmp > 255) // maximum value of uint8_t
                dst[ j ] = 255;
            else if (tmp < 0)
                dst[ j ] = 0;
            else
                dst[ j ] = tmp;
        }
        src += src_stride;
        dst += dst_stride;
    }
}

With SSE2 the clipping comes almost for free: _mm_packus_epi16 saturates each signed 16-bit value to [0, 255] while packing back to bytes:

void compensate_8(const int16_t * src,
                  uint8_t * dst,
                  size_t src_stride,
                  size_t dst_stride)
{
    __m128i x0, x1, zero;
    zero = _mm_setzero_si128();
    for(int i = 0; i < 8; i++)
    {
        x0 = _mm_loadu_si128((__m128i*)src); // 8 pixels
        x1 = _mm_loadl_epi64((__m128i*)dst); // 8 bit !
        x1 = _mm_unpacklo_epi8(x1, zero);    // from 8 to 16 bit
        x0 = _mm_add_epi16( x0, x1);
        x0 = _mm_packus_epi16(x0, x0);       // back to 8 bit, with saturation
        _mm_storel_epi64((__m128i*)dst, x0);
        src += src_stride;
        dst += dst_stride;
    }
}
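
A small standalone illustration of that saturation (the test values are ours):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main()
{
    __m128i v = _mm_setr_epi16(-5, 0, 100, 255, 256, 300, 32767, -32768);
    __m128i p = _mm_packus_epi16(v, v); // clamp each value to [0, 255]
    uint8_t out[16];
    _mm_storeu_si128((__m128i*)out, p);
    for(int i = 0; i < 8; i++)
        printf("%d ", out[i]); // prints: 0 0 100 255 255 255 255 0
    printf("\n");
    return 0;
}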

For higher bit depths the maximum sample value is not a constant, so it is computed from the bitdepth parameter and the clamp is done explicitly with min/max:

void compensate_4(const int32_t * src,
                  uint16_t *dst,
                  size_t src_stride,
                  size_t dst_stride,
                  int bitdepth)
{
    __m128i x0, x1, zero, max_val;
    zero = _mm_setzero_si128();
    /* e.g. 1023 for 10-bit video; assumes bitdepth <= 15,
       so all values stay within signed 16-bit range */
    max_val = _mm_set1_epi16((1 << bitdepth) - 1);
    for(int i = 0; i < 4; i++)
    {
        x0 = _mm_loadu_si128((__m128i*)src); // 4 x 32
        x1 = _mm_loadl_epi64((__m128i*)dst); // 4 x 16
        x1 = _mm_unpacklo_epi16(x1, zero);   // from 16 bit to 32 bit
        x0 = _mm_add_epi32(x0, x1);
        x0 = _mm_packs_epi32( x0, x0 ); // from 32 bit to 16 bit, signed saturation
        /* if x0[k] < max_val then x0[k], else max_val */
        x0 = _mm_min_epi16(x0, max_val);
        x0 = _mm_max_epi16(x0, zero);
        _mm_storel_epi64((__m128i*)dst, x0);
        src += src_stride;
        dst += dst_stride;
    }
}

Block-matching metrics vectorize well too. SSE2 has a dedicated instruction for the sum of absolute differences (SAD) of 8-bit pixels: _mm_sad_epu8 produces two partial sums, one in each 64-bit half of the register. A 16x16 SAD:

#include <emmintrin.h>
#include <stdint.h>

int32_t sad_16_8bit(const uint8_t* src0,
                    const uint8_t* src1,
                    size_t src0_stride,
                    size_t src1_stride)
{
    __m128i x0, x1, sum;
    sum = _mm_setzero_si128();
    for(int i = 0; i < 16; i++)
    {
        x0 = _mm_loadu_si128((__m128i*)src0);
        x1 = _mm_loadu_si128((__m128i*)src1);
        x0 = _mm_sad_epu8(x0, x1);    // partial sums in the two 64-bit halves
        sum = _mm_add_epi32(sum, x0); // sum for lower and upper halves
        src0 += src0_stride;
        src1 += src1_stride;
    }
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2)); // swap the halves
    sum = _mm_add_epi32(sum, x0);
    int32_t s = _mm_cvtsi128_si32(sum); // result
    return s;
}
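
For reference, a scalar equivalent the vector version can be checked against (the _c suffix is ours):

#include <stdint.h>
#include <stdlib.h>

int32_t sad_16_8bit_c(const uint8_t* src0,
                      const uint8_t* src1,
                      size_t src0_stride,
                      size_t src1_stride)
{
    int32_t s = 0;
    for(int i = 0; i < 16; i++)
    {
        for(int j = 0; j < 16; j++)
            s += abs(src0[j] - src1[j]);
        src0 += src0_stride;
        src1 += src1_stride;
    }
    return s;
}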

There is no SAD instruction for 16-bit pixels, so |x0 - x1| has to be assembled from saturating subtractions: _mm_subs_epu16 clamps negative differences to zero, so one of the two subtraction results is always zero and their XOR (or OR, or sum) equals the absolute difference. The 8x8 case (the enclosing signature is assumed; the original shows only the loop body):

int32_t sad_8_16bit(const uint16_t* src0,
                    const uint16_t* src1,
                    size_t src0_stride,
                    size_t src1_stride)
{
    __m128i x0, x1, x2, sum, zero;
    zero = _mm_setzero_si128();
    sum = zero;
    for(int i = 0; i < 8; i++)
    {
        x0 = _mm_loadu_si128((__m128i*)src0);
        x1 = _mm_loadu_si128((__m128i*)src1);
        /* | x0 - x1 | */
        x2 = x0;
        x0 = _mm_subs_epu16(x0, x1); // max(x0 - x1, 0)
        x1 = _mm_subs_epu16(x1, x2); // max(x1 - x0, 0)
        x0 = _mm_xor_si128(x0, x1);  // one operand is always zero
        x1 = x0;
        x0 = _mm_unpacklo_epi16(x0, zero); // 16 bit to 32 bit
        x1 = _mm_unpackhi_epi16(x1, zero);
        sum = _mm_add_epi32(sum, x0);
        sum = _mm_add_epi32(sum, x1);
        src0 += src0_stride;
        src1 += src1_stride;
    }
    /* sum is a0,a1,a2,a3 */
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(2,3,0,1)); // x0 is a1,a0,a3,a2
    sum = _mm_add_epi32(sum, x0);
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2));
    sum = _mm_add_epi32(sum, x0);
    int32_t s = _mm_cvtsi128_si32(sum); // result
    return s;
}
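
The same trick in scalar form, to make the identity explicit (illustration only):

#include <stdint.h>

uint16_t abs_diff_u16(uint16_t a, uint16_t b)
{
    uint16_t d0 = (uint16_t)(a > b ? a - b : 0); // saturating a - b
    uint16_t d1 = (uint16_t)(b > a ? b - a : 0); // saturating b - a
    return d0 ^ d1; // one term is zero, so this is |a - b|
}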

The sum of squared differences (SSD) for 8-bit pixels can lean on _mm_madd_epi16, which multiplies eight 16-bit pairs and adds adjacent products into four 32-bit lanes; applied to the difference with itself, it produces pairwise sums of squares directly. Again the 8x8 case, with an assumed signature and the final reduction completed after the loop in the style of the SAD code above:

int32_t ssd_8_8bit(const uint8_t* src0,
                   const uint8_t* src1,
                   size_t src0_stride,
                   size_t src1_stride)
{
    __m128i x0, x1, sum, zero;
    zero = _mm_setzero_si128();
    sum = zero;
    for(int i = 0; i < 8; i++)
    {
        x0 = _mm_loadl_epi64((__m128i*)src0);
        x1 = _mm_loadl_epi64((__m128i*)src1);
        x0 = _mm_unpacklo_epi8(x0, zero); // 8 to 16 bit
        x1 = _mm_unpacklo_epi8(x1, zero);
        x0 = _mm_sub_epi16(x0, x1);  // x0 - x1
        x0 = _mm_madd_epi16(x0, x0); // (x0 - x1)^2, adjacent pairs summed
        sum = _mm_add_epi32(sum, x0);
        src0 += src0_stride;
        src1 += src1_stride;
    }
    // sum of sum elements
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(2,3,0,1));
    sum = _mm_add_epi32(sum, x0);
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2));
    sum = _mm_add_epi32(sum, x0);
    return _mm_cvtsi128_si32(sum); // result
}

For 16-bit pixels the squares no longer fit in 16 bits, so the full 32-bit products are rebuilt from _mm_mullo_epi16 (low halves of the products) and _mm_mulhi_epu16 (high halves), and the accumulator is widened to 64 bits. Signature assumed as before; the intermediate 32-bit additions are safe for pixel data of up to 15 bits:

uint64_t ssd_8_16bit(const uint16_t* src0,
                     const uint16_t* src1,
                     size_t src0_stride,
                     size_t src1_stride)
{
    __m128i x0, x1, x2, sum, zero;
    zero = _mm_setzero_si128();
    sum = zero;
    for(int i = 0; i < 8; i++)
    {
        x0 = _mm_loadu_si128((__m128i*)src0);
        x1 = _mm_loadu_si128((__m128i*)src1);
        /* | x0 - x1 | */
        x2 = x0;
        x0 = _mm_subs_epu16(x0, x1);
        x1 = _mm_subs_epu16(x1, x2);
        x0 = _mm_xor_si128(x0, x1);
        /* x0^2 */
        x1 = x0;
        x0 = _mm_mullo_epi16( x0, x0 ); // low 16 bits of the products
        x1 = _mm_mulhi_epu16( x1, x1 ); // high 16 bits of the products
        x2 = x0;
        x0 = _mm_unpacklo_epi16( x0, x1 ); // x0[ i ]^2, i = 0..3
        x2 = _mm_unpackhi_epi16( x2, x1 ); // x0[ i ]^2, i = 4..7
        x0 = _mm_add_epi32( x0, x2 );
        x2 = x0;
        x0 = _mm_unpacklo_epi32(x0, zero); // from 32 to 64 bit
        x2 = _mm_unpackhi_epi32(x2, zero);
        sum = _mm_add_epi64(sum, x0);
        sum = _mm_add_epi64(sum, x2);
        src0 += src0_stride;
        src1 += src1_stride;
    }
    // sum of sum elements
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2));
    sum = _mm_add_epi64(sum, x0);
    uint64_t result;
    _mm_storel_epi64((__m128i*)&result, sum);
    return result;
}
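
And a scalar reference to check the 16-bit SSD against (name ours):

#include <stdint.h>
#include <stddef.h>

uint64_t ssd_8_16bit_c(const uint16_t* src0,
                       const uint16_t* src1,
                       size_t src0_stride,
                       size_t src1_stride)
{
    uint64_t s = 0;
    for(int i = 0; i < 8; i++)
    {
        for(int j = 0; j < 8; j++)
        {
            int64_t d = (int64_t)src0[j] - src1[j];
            s += (uint64_t)(d * d);
        }
        src0 += src0_stride;
        src1 += src1_stride;
    }
    return s;
}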

Real blocks and frames are not always a multiple of 16 pixels wide. The generic SAD processes as many full 16-byte chunks per row as possible and finishes the row with a scalar tail loop:

#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>

uint64_t sad_8bit(const uint8_t* src0,
                  const uint8_t* src1,
                  size_t width,
                  size_t height,
                  size_t src0_stride,
                  size_t src1_stride)
{
    size_t width16 = width - (width % 16); // width16 == 16*x
    __m128i x0, x1, sum;
    sum = _mm_setzero_si128();
    uint64_t sum_tail = 0;
    for(size_t i = 0; i < height; i++)
    {
        for(size_t j = 0; j < width16; j += 16)
        {
            x0 = _mm_loadu_si128((__m128i*)(src0 + j));
            x1 = _mm_loadu_si128((__m128i*)(src1 + j));
            x0 = _mm_sad_epu8(x0, x1);
            sum = _mm_add_epi64(sum, x0);
        }
        for(size_t j = width16; j < width; j++)
        {
            sum_tail += abs(src0[j] - src1[j]);
        }
        src0 += src0_stride;
        src1 += src1_stride;
    }
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2));
    sum = _mm_add_epi64(sum, x0);
    uint64_t sum_total;
    _mm_storel_epi64((__m128i*)&sum_total, sum);
    sum_total += sum_tail;
    return sum_total;
}
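
Usage on an odd width (buffers hypothetical): with width = 23 the vector loop covers pixels 0..15 of every row and the scalar tail covers pixels 16..22:

#include <stdint.h>
#include <stdio.h>

int main()
{
    static uint8_t a[5 * 32], b[5 * 32];
    for(int i = 0; i < 5 * 32; i++)
    {
        a[i] = (uint8_t)(i & 0xFF);
        b[i] = (uint8_t)((i + 3) & 0xFF);
    }
    // 23 x 5 region inside buffers with a stride of 32;
    // every |a[i] - b[i]| is 3, so the expected SAD is 23 * 5 * 3 = 345
    uint64_t s = sad_8bit(a, b, 23, 5, 32, 32);
    printf("SAD = %llu\n", (unsigned long long)s);
    return 0;
}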

The scalar tail can be shortened further by handling 8- and 4-pixel leftovers with vector code, leaving at most 3 pixels per row to the scalar loop:

#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>

uint64_t sad_8bit(const uint8_t* src0,
                  const uint8_t* src1,
                  size_t width,
                  size_t height,
                  size_t src0_stride,
                  size_t src1_stride)
{
    size_t width_r = width % 16;
    size_t width16 = width - width_r;        // width16 == 16*x
    size_t width8 = width_r - (width_r % 8); // 8 or 0
    width_r -= width8;
    size_t width4 = width_r - (width_r % 4); // 4 or 0
    width_r -= width4;                       // 0, 1, 2, or 3
    __m128i x0, x1, sum;
    sum = _mm_setzero_si128();
    uint64_t sum_tail = 0;
    for(size_t i = 0; i < height; i++)
    {
        for(size_t j = 0; j < width16; j += 16)
        {
            /* SAD calculation as in the previous listing */
            x0 = _mm_loadu_si128((__m128i*)(src0 + j));
            x1 = _mm_loadu_si128((__m128i*)(src1 + j));
            x0 = _mm_sad_epu8(x0, x1);
            sum = _mm_add_epi64(sum, x0);
        }
        if( width8 )
        {
            x0 = _mm_loadl_epi64((__m128i*)(src0 + width16));
            x1 = _mm_loadl_epi64((__m128i*)(src1 + width16));
            x0 = _mm_sad_epu8(x0, x1); // upper halves are zero
            sum = _mm_add_epi64(sum, x0);
        }
        if( width4 )
        {
            x0 = _mm_cvtsi32_si128(*(int32_t*)(src0 + width16 + width8));
            x1 = _mm_cvtsi32_si128(*(int32_t*)(src1 + width16 + width8));
            x0 = _mm_sad_epu8(x0, x1);
            sum = _mm_add_epi64(sum, x0);
        }
        for(size_t j = width - width_r; j < width; j++)
        {
            sum_tail += abs(src0[j] - src1[j]);
        }
        src0 += src0_stride;
        src1 += src1_stride;
    }
    /* final reduction as in the previous listing */
    x0 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1,0,3,2));
    sum = _mm_add_epi64(sum, x0);
    uint64_t sum_total;
    _mm_storel_epi64((__m128i*)&sum_total, sum);
    return sum_total + sum_tail;
}

Useful links:

1. https://software.intel.com/sites/landingpage/IntrinsicsGuide
2. https://developer.arm.com/architectures/instruction-sets/intrinsics
3. https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/coding-for-neon---part-5-rearranging-vectors
