#ifndef INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
#define INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP

#include "mdds/global.hpp"
#include "../types.hpp"

// The SIMD specializations below need the SSE2/AVX2 intrinsic headers.
#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(__AVX2__)
#include <immintrin.h>
#endif
namespace mdds { namespace mtv { namespace soa { namespace detail {
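// adjust_block_positions shifts the stored start position of every block at or
// after start_block_index by delta.  The lu_factor_t parameter selects the
// implementation: a plain loop, a manually unrolled loop (factor 4, 8, 16 or 32),
// or SSE2/AVX2 variants that update several 64-bit positions per step.
//
// Hypothetical usage sketch (the block-store variable and its contents are
// assumptions, not part of this header): with blocks.positions == {0, 10, 25},
// inserting 5 rows in front of block 2 could be reflected via
//
//     adjust_block_positions<decltype(blocks), lu_factor_t::none>{}(blocks, 2, 5);
//
// after which blocks.positions == {0, 10, 30}.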
template<typename Blks, lu_factor_t F>
struct adjust_block_positions
{
    void operator()(Blks& /*block_store*/, int64_t /*start_block_index*/, int64_t /*delta*/) const
    {
        static_assert(
            invalid_static_int<F>, "The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::none>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < n; ++i)
            block_store.positions[i] += delta;
    }
};
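// Each manually unrolled variant below follows the same pattern: round the loop
// length down to a multiple of the unroll factor with a bit mask, run the
// unrolled loop over that range, then finish the leftover elements in a scalar
// tail loop.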
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Truncate the loop length to a multiple of 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Truncate the loop length to a multiple of 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Truncate the loop length to a multiple of 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu32>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Truncate the loop length to a multiple of 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
            block_store.positions[i + 16] += delta;
            block_store.positions[i + 17] += delta;
            block_store.positions[i + 18] += delta;
            block_store.positions[i + 19] += delta;
            block_store.positions[i + 20] += delta;
            block_store.positions[i + 21] += delta;
            block_store.positions[i + 22] += delta;
            block_store.positions[i + 23] += delta;
            block_store.positions[i + 24] += delta;
            block_store.positions[i + 25] += delta;
            block_store.positions[i + 26] += delta;
            block_store.positions[i + 27] += delta;
            block_store.positions[i + 28] += delta;
            block_store.positions[i + 29] += delta;
            block_store.positions[i + 30] += delta;
            block_store.positions[i + 31] += delta;
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
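// SSE2 variants: each __m128i covers two 64-bit positions, so a single unaligned
// load / _mm_add_epi64 / store updates two entries at a time; the _lu4, _lu8 and
// _lu16 forms additionally unroll that vector loop.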
#if defined(__SSE2__)

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        int64_t len = n - start_block_index;

        // The vector loop handles two elements per step; process an odd last
        // element separately after the loop.
        bool odd = len & 1;
        if (odd)
            len -= 1;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 2)
        {
            __m128i* dst = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
        }

        if (odd)
            block_store.positions[len] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 8 elements (4 x __m128i); truncate accordingly.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7;
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 16 elements (8 x __m128i); truncate accordingly.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15;
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 32 elements (16 x __m128i); truncate accordingly.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31;
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));

            __m128i* dst16 = (__m128i*)&block_store.positions[i + 16];
            _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));

            __m128i* dst18 = (__m128i*)&block_store.positions[i + 18];
            _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));

            __m128i* dst20 = (__m128i*)&block_store.positions[i + 20];
            _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));

            __m128i* dst22 = (__m128i*)&block_store.positions[i + 22];
            _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));

            __m128i* dst24 = (__m128i*)&block_store.positions[i + 24];
            _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));

            __m128i* dst26 = (__m128i*)&block_store.positions[i + 26];
            _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));

            __m128i* dst28 = (__m128i*)&block_store.positions[i + 28];
            _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));

            __m128i* dst30 = (__m128i*)&block_store.positions[i + 30];
            _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __SSE2__
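// AVX2 variants: each __m256i covers four 64-bit positions, updated with one
// unaligned load / _mm256_add_epi64 / store per step; the _lu4 and _lu8 forms
// unroll the vector loop further.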
#if defined(__AVX2__)

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 4 elements (one __m256i); truncate accordingly.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3;
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 16 elements (4 x __m256i); truncate accordingly.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15;
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 32 elements (8 x __m256i); truncate accordingly.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31;
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));

            __m256i* dst16 = (__m256i*)&block_store.positions[i + 16];
            _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));

            __m256i* dst20 = (__m256i*)&block_store.positions[i + 20];
            _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));

            __m256i* dst24 = (__m256i*)&block_store.positions[i + 24];
            _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));

            __m256i* dst28 = (__m256i*)&block_store.positions[i + 28];
            _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
        }

        // Handle the remaining elements.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __AVX2__

}}}} // namespace mdds::mtv::soa::detail

#endif