This documentation is automatically generated by online-judge-tools/verification-helper
#include "library/simd/prefix-sum.hpp"
#pragma once
#include <x86intrin.h>
#include <limits>
#include <algorithm>
namespace felix {
namespace simd {
inline void prefix_sum(int* a, int n) {
int last = 0;
for(int i = 0; i + 3 < n; i += 4) {
// Compute partial sum for each block of size 4.
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
cur = _mm_add_epi32(cur, _mm_slli_si128(cur, 4));
cur = _mm_add_epi32(cur, _mm_slli_si128(cur, 8));
// Accumlate sum from previous block.
__m128i lastv = _mm_set1_epi32(last);
cur = _mm_add_epi32(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 3];
}
// Compute partial sum for remaining elements.
for(int i = (n & (~3)) + (n < 4); i < n; i++) {
a[i] += a[i - 1];
}
}
inline void prefix_sum(long long* a, int n) {
long long last = 0;
for(int i = 0; i + 1 < n; i += 2) {
a[i + 1] += a[i];
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
__m128i lastv = _mm_set1_epi64x(last);
cur = _mm_add_epi64(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 1];
}
if(n & 1) {
a[n - 1] += a[n - 2];
}
}
inline void prefix_max(int* a, int n) {
int last = std::numeric_limits<int>::min();
for(int i = 0; i + 3 < n; i += 4) {
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
cur = _mm_max_epi32(cur, _mm_slli_si128(cur, 4));
cur = _mm_max_epi32(cur, _mm_slli_si128(cur, 8));
__m128i lastv = _mm_set1_epi32(last);
cur = _mm_max_epi32(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 3];
}
for(int i = (n & (~3)) + (n < 4); i < n; i++) {
a[i] = std::max(a[i], a[i - 1]);
}
}
inline void prefix_max(long long* a, int n) {
long long last = std::numeric_limits<int>::min();
for(int i = 0; i + 1 < n; i += 2) {
a[i + 1] = std::max(a[i + 1], a[i]);
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
__m128i lastv = _mm_set1_epi64x(last);
cur = _mm_max_epi64(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 1];
}
if(n & 1) {
a[n - 1] = std::max(a[n - 1], a[n - 2]);
}
}
inline void prefix_min(int* a, int n) {
int last = std::numeric_limits<int>::max();
for(int i = 0; i + 3 < n; i += 4) {
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
cur = _mm_min_epi32(cur, _mm_slli_si128(cur, 4));
cur = _mm_min_epi32(cur, _mm_slli_si128(cur, 8));
__m128i lastv = _mm_set1_epi32(last);
cur = _mm_min_epi32(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 3];
}
for(int i = (n & (~3)) + (n < 4); i < n; i++) {
a[i] = std::min(a[i], a[i - 1]);
}
}
inline void prefix_min(long long* a, int n) {
long long last = std::numeric_limits<int>::max();
for(int i = 0; i + 1 < n; i += 2) {
a[i + 1] = std::min(a[i + 1], a[i]);
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
__m128i lastv = _mm_set1_epi64x(last);
cur = _mm_min_epi64(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 1];
}
if(n & 1) {
a[n - 1] = std::min(a[n - 1], a[n - 2]);
}
}
} // namespace simd
} // namespace felix
#line 2 "library/simd/prefix-sum.hpp"
#include <x86intrin.h>
#include <limits>
#include <algorithm>
namespace felix {
namespace simd {
inline void prefix_sum(int* a, int n) {
int last = 0;
for(int i = 0; i + 3 < n; i += 4) {
// Compute partial sum for each block of size 4.
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
cur = _mm_add_epi32(cur, _mm_slli_si128(cur, 4));
cur = _mm_add_epi32(cur, _mm_slli_si128(cur, 8));
// Accumlate sum from previous block.
__m128i lastv = _mm_set1_epi32(last);
cur = _mm_add_epi32(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 3];
}
// Compute partial sum for remaining elements.
for(int i = (n & (~3)) + (n < 4); i < n; i++) {
a[i] += a[i - 1];
}
}
inline void prefix_sum(long long* a, int n) {
long long last = 0;
for(int i = 0; i + 1 < n; i += 2) {
a[i + 1] += a[i];
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
__m128i lastv = _mm_set1_epi64x(last);
cur = _mm_add_epi64(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 1];
}
if(n & 1) {
a[n - 1] += a[n - 2];
}
}
inline void prefix_max(int* a, int n) {
int last = std::numeric_limits<int>::min();
for(int i = 0; i + 3 < n; i += 4) {
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
cur = _mm_max_epi32(cur, _mm_slli_si128(cur, 4));
cur = _mm_max_epi32(cur, _mm_slli_si128(cur, 8));
__m128i lastv = _mm_set1_epi32(last);
cur = _mm_max_epi32(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 3];
}
for(int i = (n & (~3)) + (n < 4); i < n; i++) {
a[i] = std::max(a[i], a[i - 1]);
}
}
inline void prefix_max(long long* a, int n) {
long long last = std::numeric_limits<int>::min();
for(int i = 0; i + 1 < n; i += 2) {
a[i + 1] = std::max(a[i + 1], a[i]);
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
__m128i lastv = _mm_set1_epi64x(last);
cur = _mm_max_epi64(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 1];
}
if(n & 1) {
a[n - 1] = std::max(a[n - 1], a[n - 2]);
}
}
inline void prefix_min(int* a, int n) {
int last = std::numeric_limits<int>::max();
for(int i = 0; i + 3 < n; i += 4) {
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
cur = _mm_min_epi32(cur, _mm_slli_si128(cur, 4));
cur = _mm_min_epi32(cur, _mm_slli_si128(cur, 8));
__m128i lastv = _mm_set1_epi32(last);
cur = _mm_min_epi32(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 3];
}
for(int i = (n & (~3)) + (n < 4); i < n; i++) {
a[i] = std::min(a[i], a[i - 1]);
}
}
inline void prefix_min(long long* a, int n) {
long long last = std::numeric_limits<int>::max();
for(int i = 0; i + 1 < n; i += 2) {
a[i + 1] = std::min(a[i + 1], a[i]);
__m128i cur = _mm_load_si128((__m128i*) &a[i]);
__m128i lastv = _mm_set1_epi64x(last);
cur = _mm_min_epi64(lastv, cur);
_mm_store_si128((__m128i*) &a[i], cur);
last = a[i + 1];
}
if(n & 1) {
a[n - 1] = std::min(a[n - 1], a[n - 2]);
}
}
} // namespace simd
} // namespace felix